
(1) This work addresses the safety of unmanned surface vehicle (USV) path following and obstacle avoidance in complex marine environments and proposes a safe reinforcement learning method, DWAS-RL. Its core goal is to balance path-following accuracy against collision-avoidance safety, overcoming the high collision rate of conventional deep reinforcement learning during exploration that makes it hard to deploy in practice. A marine environment model is first built that contains static obstacles (reefs, fixed platforms) and dynamic obstacles (floating objects, other vessels), while also accounting for dynamic ocean factors such as wind, waves, and currents. The state space is a multi-dimensional vector including the USV's position, heading, and speed, the relative positions and velocities of surrounding obstacles, and environmental parameters such as current direction and wave height. The action space uses continuous control commands, namely rudder angle and thrust, which better match real maneuvering characteristics. The reward function takes a multi-objective form, combining a path-following reward (encouraging small deviation from the desired path), an obstacle-avoidance penalty (inversely proportional to the distance to obstacles), a safety-constraint reward (encouraging compliance with the COLREGs), and an energy-efficiency reward (penalizing large control actions).
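As a minimal sketch of how such a multi-objective reward could be combined (the weights and the helper inputs path_error, obstacle_distances, obstacle_radii, and colreg_compliant are illustrative placeholders, not quantities taken from the paper):

import numpy as np

def multi_objective_reward(path_error, obstacle_distances, obstacle_radii,
                           rudder, thrust, colreg_compliant,
                           w_track=0.5, w_safe=2.0, w_rule=0.1, w_energy=0.01):
    # Path-following reward: penalize deviation from the desired path
    r_track = -w_track * path_error
    # Obstacle-avoidance penalty: grows as the USV gets closer than a safe margin
    r_safe = 0.0
    for d, r in zip(obstacle_distances, obstacle_radii):
        margin = r + 5.0  # assumed 5 m clearance around each obstacle
        if d < margin:
            r_safe -= w_safe * (margin - d)
    # Safety-constraint reward: small bonus for COLREG-compliant behaviour
    r_rule = w_rule if colreg_compliant else 0.0
    # Energy-efficiency reward: penalize large rudder/thrust commands
    r_energy = -w_energy * (rudder ** 2 + thrust ** 2)
    return r_track + r_safe + r_rule + r_energy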
(2) The safe reinforcement learning framework is novel in three respects. First, it integrates model predictive control with reinforcement learning in a hybrid architecture that uses the ship dynamics model as a safety constraint, ensuring that the generated actions respect the USV's physical characteristics. To address the unsafe exploration of conventional reinforcement learning, a safety-layer mechanism is designed: after the agent outputs an action, the ship motion model predicts the trajectory over a short horizon; if the prediction enters a dangerous region, a correction mechanism is triggered and model predictive control generates a safe action to replace the original one. Second, a dynamic state-space construction method is proposed that uses K-means clustering to group surrounding obstacles, solving the problem that a fixed input dimension cannot handle a variable number of obstacles. When many obstacles are present, the clustering algorithm summarizes them into a few representative groups, reducing the state-space dimension; when there are few obstacles, individual features are preserved to keep avoidance precise. Third, an adaptive reward function is designed to overcome the limitations of a fixed safety distance in busy waterways: the safety-distance threshold is adjusted dynamically according to obstacle density, shrinking it in congested waters so passage remains feasible and enlarging it in open waters to improve safety.
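Below is a minimal sketch of the safety-layer idea, assuming a simple planar kinematic model and a rule-based fallback in place of a full MPC solver; the function names (predict_trajectory, adaptive_safe_distance, safety_filter), the prediction horizon, and the density-based threshold formula are illustrative assumptions rather than the paper's exact implementation:

import numpy as np

def predict_trajectory(state, action, horizon=10, dt=0.1):
    # Roll a simple planar kinematic ship model forward over a short horizon
    x, y, heading, speed = state
    rudder, thrust = action
    trajectory = []
    for _ in range(horizon):
        heading += rudder * dt
        speed += (thrust - speed) * dt
        x += speed * np.cos(heading) * dt
        y += speed * np.sin(heading) * dt
        trajectory.append((x, y))
    return trajectory

def adaptive_safe_distance(num_obstacles, base=10.0, min_dist=4.0):
    # Shrink the safety threshold as the waterway becomes more crowded
    return max(min_dist, base - 0.5 * num_obstacles)

def safety_filter(state, action, obstacles):
    # If the predicted trajectory violates the density-dependent safe distance,
    # replace the RL action with a conservative fallback (a stand-in for the MPC correction)
    safe_dist = adaptive_safe_distance(len(obstacles))
    for px, py in predict_trajectory(state, action):
        for ox, oy, radius in obstacles:
            if np.hypot(px - ox, py - oy) < radius + safe_dist:
                # Assumed convention: positive rudder = starboard turn; also reduce thrust
                return np.array([0.3, 0.5 * action[1]])
    return action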
(3) For simulation and experimental validation, three typical marine scenarios were designed: path following in open water, dynamic obstacle avoidance, and multi-ship encounters in a busy waterway. In open water, DWAS-RL was compared with the conventional DWA and APF methods; the results show that DWAS-RL reduces the tracking error by 35% while producing smoother control actions. In the dynamic obstacle-avoidance test, multiple moving obstacles were placed to emulate a realistic marine environment, and DWAS-RL achieved a successful avoidance rate of 98%, versus only 82% for a conventional DRL method. The busy-waterway multi-ship encounter scenario focused on compliance with the COLREGs: in typical crossing, head-on, and overtaking situations, DWAS-RL correctly took rule-compliant avoidance actions such as turning to starboard and slowing down. Robustness tests under different sea states were also conducted; at sea state 3 (1.5 m wave height), DWAS-RL maintained a task completion rate above 90%, demonstrating good environmental adaptability. Finally, the simulation results were validated by full-scale trials: autonomous path following and obstacle avoidance were completed in a real marine environment with an average tracking error below 2 m and a minimum avoidance distance above 15 m.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import random

class SafeActorNetwork(nn.Module):
    """Gaussian policy network: outputs the mean and log-std of the action distribution."""
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(SafeActorNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.mu = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # Return the raw (pre-tanh) mean; tanh squashing is applied after sampling
        mu = self.mu(x)
        log_std = torch.clamp(self.log_std(x), -20, 2)
        return mu, log_std

class SafeCriticNetwork(nn.Module):
    """Q-network: estimates the action value for a state-action pair."""
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(SafeCriticNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.q = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.q(x)

class SafetyLayer:
    """Evaluates safety constraints and attenuates actions that would violate them."""
    def __init__(self, state_dim, action_dim, constraint_dim=3):
        self.constraint_dim = constraint_dim
        # Placeholder for a learned safety critic (not used by the rule-based checks below)
        self.safety_critic = nn.Linear(state_dim, constraint_dim * action_dim)

    def evaluate_constraints(self, state, obstacle_info):
        """Constraint 0: collision margin; constraint 1: heading limit; constraint 2: speed limit."""
        batch_size = state.shape[0]
        constraints = torch.zeros(batch_size, self.constraint_dim)
        for i in range(batch_size):
            pos_x, pos_y, heading, speed = [float(v) for v in state[i][:4]]
            # Obstacles are packed as flat (x, y, radius) triples; radius <= 0 means padding
            for j in range(len(obstacle_info[i]) // 3):
                idx = j * 3
                if obstacle_info[i][idx + 2] > 0:
                    obs_x, obs_y, obs_radius = obstacle_info[i][idx], obstacle_info[i][idx + 1], obstacle_info[i][idx + 2]
                    distance = np.sqrt((pos_x - obs_x) ** 2 + (pos_y - obs_y) ** 2)
                    safe_distance = obs_radius + 5.0
                    constraints[i, 0] += max(0.0, safe_distance - distance)
            constraints[i, 1] = max(0.0, abs(heading) - 1.0)
            constraints[i, 2] = max(0.0, speed - 2.0)
        return constraints

    def safe_action(self, nominal_action, state, obstacle_info):
        """Scale down rudder and thrust when the average constraint violation exceeds a threshold."""
        constraints = self.evaluate_constraints(state, obstacle_info).mean(dim=1)
        safe_action = nominal_action.clone()
        for i in range(nominal_action.shape[0]):
            if constraints[i] > 0.1:
                safe_action[i, 0] = nominal_action[i, 0] * 0.5
                safe_action[i, 1] = nominal_action[i, 1] * 0.7
        return safe_action

class DWAS_RL:
    """Safe RL agent combining an SAC-style actor-critic with a safety layer."""
    def __init__(self, state_dim, action_dim, learning_rate=3e-4, gamma=0.99, tau=0.005, safety_weight=0.3):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau
        self.safety_weight = safety_weight
        self.actor = SafeActorNetwork(state_dim, action_dim)
        self.critic = SafeCriticNetwork(state_dim, action_dim)
        self.target_critic = SafeCriticNetwork(state_dim, action_dim)
        # Initialize the target critic with the critic's weights
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)
        self.safety_layer = SafetyLayer(state_dim, action_dim)
        self.memory = deque(maxlen=100000)

    def select_action(self, state, obstacle_info, evaluate=False):
        """Sample a tanh-squashed Gaussian action, then pass it through the safety layer."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            mu, log_std = self.actor(state_tensor)
            if evaluate:
                action = mu
            else:
                std = log_std.exp()
                action = mu + std * torch.randn_like(std)
            action = torch.tanh(action)
            safe_action = self.safety_layer.safe_action(action, state_tensor, obstacle_info)
        return safe_action.numpy()[0]

    def store_transition(self, state, action, reward, next_state, done, obstacle_info):
        self.memory.append((state, action, reward, next_state, done, obstacle_info))
    def k_means_obstacle_clustering(self, obstacle_info, k=3):
        """Compress a variable number of obstacles into k representative (x, y, radius) groups."""
        if len(obstacle_info) == 0:
            return np.zeros(k * 3)
        obstacles = np.array(obstacle_info).reshape(-1, 3)
        valid_obstacles = obstacles[obstacles[:, 2] > 0]
        if len(valid_obstacles) == 0:
            return np.zeros(k * 3)
        if len(valid_obstacles) <= k:
            # Few obstacles: keep individual features and zero-pad to a fixed length
            clustered = np.zeros(k * 3)
            clustered[:len(valid_obstacles) * 3] = valid_obstacles.flatten()
            return clustered
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(valid_obstacles[:, :2])
        clustered_obstacles = []
        for i in range(k):
            cluster_points = valid_obstacles[kmeans.labels_ == i]
            if len(cluster_points) > 0:
                center = np.mean(cluster_points[:, :2], axis=0)
                avg_radius = np.mean(cluster_points[:, 2])
                max_radius = np.max(cluster_points[:, 2])
                combined_radius = max(avg_radius, max_radius * 0.7)
                clustered_obstacles.extend([center[0], center[1], combined_radius])
            else:
                clustered_obstacles.extend([0, 0, 0])
        return np.array(clustered_obstacles)
    def calculate_reward(self, state, action, next_state, obstacle_info, desired_path):
        """Multi-objective reward: path tracking + collision penalty + rule bonus + energy cost."""
        pos_x, pos_y = state[0], state[1]
        # Look up the nearest desired waypoint along the path
        desired_pos = desired_path[min(int(pos_x * 10) % len(desired_path), len(desired_path) - 1)]
        distance_to_path = np.sqrt((pos_x - desired_pos[0]) ** 2 + (pos_y - desired_pos[1]) ** 2)
        tracking_reward = -distance_to_path * 0.5
        safety_penalty = 0
        for i in range(0, len(obstacle_info), 3):
            if obstacle_info[i + 2] > 0:
                obs_x, obs_y, radius = obstacle_info[i], obstacle_info[i + 1], obstacle_info[i + 2]
                distance = np.sqrt((pos_x - obs_x) ** 2 + (pos_y - obs_y) ** 2)
                if distance < radius + 5.0:
                    safety_penalty -= (radius + 5.0 - distance) * 2.0
        # Small bonus for keeping a steady heading (proxy for rule-compliant behaviour)
        rules_reward = 0.1 if abs(state[2]) < 0.3 else 0.0
        efficiency_penalty = -np.sum(np.square(action)) * 0.01
        return tracking_reward + safety_penalty + rules_reward + efficiency_penalty
    def train(self, batch_size=256):
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones, obstacles_info = zip(*batch)
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions))
        rewards = torch.FloatTensor(np.array(rewards, dtype=np.float32)).unsqueeze(1)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(np.array(dones, dtype=np.float32)).unsqueeze(1)
        # Critic update: one-step TD target from the target critic
        with torch.no_grad():
            next_mu, _ = self.actor(next_states)
            next_actions = torch.tanh(next_mu)
            target_q = self.target_critic(next_states, next_actions)
            target_q = rewards + (1 - dones) * self.gamma * target_q
        current_q = self.critic(states, actions)
        critic_loss = F.mse_loss(current_q, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # Actor update: reparameterized sample with a tanh-squashed Gaussian log-probability
        mu, log_stds = self.actor(states)
        stds = log_stds.exp()
        noise = torch.randn_like(stds)
        actor_actions = torch.tanh(mu + stds * noise)
        actor_q = self.critic(states, actor_actions)
        log_probs = (-0.5 * noise ** 2 - log_stds - 0.5 * np.log(2 * np.pi)).sum(1, keepdim=True)
        log_probs = log_probs - torch.log(1 - actor_actions ** 2 + 1e-6).sum(1, keepdim=True)
        actor_loss = -torch.mean(actor_q - 0.2 * log_probs)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # Soft update of the target critic
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

class MarineEnvironment:
    """Simplified 2-D marine environment with randomly placed circular obstacles."""
    def __init__(self, width=100, height=100):
        self.width = width
        self.height = height
        self.obstacles = []
        self.usv_state = np.array([0, 0, 0, 0], dtype=float)
        # Desired path: a gentle sinusoidal track
        self.desired_path = [(i / 10, 5 * np.sin(i / 10)) for i in range(100)]

    def reset(self):
        self.usv_state = np.array([0, 0, 0, 1.0])
        self.obstacles = []
        for _ in range(8):
            x = np.random.uniform(10, 90)
            y = np.random.uniform(10, 90)
            radius = np.random.uniform(3, 8)
            self.obstacles.append((x, y, radius))
        return self.get_state()

    def get_state(self):
        # Pack obstacle info as flat (x, y, radius) triples, zero-padded to 8 obstacles (24 values)
        obstacle_info = []
        for obs in self.obstacles:
            obstacle_info.extend(obs)
        while len(obstacle_info) < 24:
            obstacle_info.append(0)
        state = np.concatenate([self.usv_state, obstacle_info[:24]])
        return state, obstacle_info[:24]

    def step(self, action):
        delta, thrust = action
        delta = np.clip(delta, -0.5, 0.5)
        thrust = np.clip(thrust, 0, 2.0)
        x, y, heading, speed = self.usv_state
        # Simple first-order kinematic update (dt = 0.1 s)
        new_heading = heading + delta * 0.1
        new_speed = speed + (thrust - speed) * 0.1
        new_x = x + new_speed * np.cos(new_heading) * 0.1
        new_y = y + new_speed * np.sin(new_heading) * 0.1
        self.usv_state = np.array([new_x, new_y, new_heading, new_speed])
        done = new_x > 10 or abs(new_y) > 10 or new_x < -1
        next_state, obstacle_info = self.get_state()
        reward = self.calculate_reward(next_state, action, obstacle_info)
        return next_state, reward, done, obstacle_info

    def calculate_reward(self, state, action, obstacle_info):
        # Minimal placeholder reward (action-energy penalty only)
        return -np.sum(np.square(action)) * 0.01

def train_safe_rl_usv():
    env = MarineEnvironment()
    # Full state = 28-dim environment state (4 USV values + 24 raw obstacle values)
    # plus 9 clustered obstacle features (k = 3 clusters x 3 values)
    state_dim = 37
    action_dim = 2
    agent = DWAS_RL(state_dim, action_dim)
    episodes = 1000
    max_steps = 500
    returns = []
    for episode in range(episodes):
        state, obstacle_info = env.reset()
        episode_return = 0
        done = False
        steps = 0
        while not done and steps < max_steps:
            clustered_obstacles = agent.k_means_obstacle_clustering(obstacle_info)
            full_state = np.concatenate([state, clustered_obstacles])
            action = agent.select_action(full_state, [obstacle_info])
            next_state, _, done, next_obstacle_info = env.step(action)
            # Use the agent's multi-objective reward (tracking + safety + rules + energy)
            reward = agent.calculate_reward(state, action, next_state, obstacle_info, env.desired_path)
            next_clustered = agent.k_means_obstacle_clustering(next_obstacle_info)
            next_full_state = np.concatenate([next_state, next_clustered])
            agent.store_transition(full_state, action, reward, next_full_state, done, obstacle_info)
            agent.train()
            state = next_state
            obstacle_info = next_obstacle_info
            episode_return += reward
            steps += 1
        returns.append(episode_return)
        if episode % 50 == 0:
            print(f"Episode {episode}, Return: {episode_return:.2f}")
    return returns


if __name__ == "__main__":
    returns = train_safe_rl_usv()
    plt.plot(returns)
    plt.title('DWAS-RL Training Returns')
    plt.xlabel('Episode')
    plt.ylabel('Return')
    plt.show()
