Trajectory planning is a core problem in robot control. SAC (Soft Actor-Critic), thanks to its strong exploration behavior and training stability, is particularly well suited to trajectory-planning problems framed as continuous control. Below is a detailed walkthrough of how to build an efficient trajectory planner with SAC.
1. Modeling the Trajectory-Planning Problem

1.1 State-Space Design

For robot trajectory planning, a typical state representation includes:

```python
state = {
    'joint_positions': [q1, q2, ..., qn],        # joint angles
    'joint_velocities': [dq1, dq2, ..., dqn],    # joint velocities
    'end_effector_pos': [x, y, z],               # end-effector position
    'target_pos': [x_t, y_t, z_t],               # target position
    'last_action': [τ1_prev, ..., τn_prev],      # previous action (used by the smoothness penalty below)
    'obstacle_info': [[obs1_x, obs1_y, obs1_z, obs1_r], ...]  # one (center, radius) entry per obstacle
}
```
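The policy and value networks take a flat vector, so this dictionary has to be concatenated in a fixed order before being fed to the model. A minimal sketch, assuming the field names above and a hypothetical helper called `flatten_state`:

```python
import numpy as np

def flatten_state(state: dict) -> np.ndarray:
    """Concatenate the state dictionary into a fixed-order flat vector."""
    return np.concatenate([
        np.asarray(state['joint_positions'], dtype=np.float32),
        np.asarray(state['joint_velocities'], dtype=np.float32),
        np.asarray(state['end_effector_pos'], dtype=np.float32),
        np.asarray(state['target_pos'], dtype=np.float32),
        np.asarray(state['last_action'], dtype=np.float32),
        np.asarray(state['obstacle_info'], dtype=np.float32).ravel(),  # flatten per-obstacle entries
    ])
```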
1.2 Action-Space Design

```python
action = [τ1, τ2, ..., τn]  # torque command for each joint
# typically normalized to the range [-1, 1]
```
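Because SAC's tanh-squashed Gaussian policy emits actions in [-1, 1], they need to be rescaled to the robot's physical torque limits before execution. A minimal sketch, assuming hypothetical per-joint limits in `TORQUE_LIMITS`:

```python
import numpy as np

# Hypothetical per-joint torque limits in N·m; replace with your robot's values.
TORQUE_LIMITS = np.array([50.0, 50.0, 30.0, 30.0, 10.0, 10.0])

def scale_action(normalized_action):
    """Map a policy output in [-1, 1] to physical torque commands."""
    return np.clip(normalized_action, -1.0, 1.0) * TORQUE_LIMITS
```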
1.3 Reward-Function Design

An example multi-objective reward function:

```python
import numpy as np

def compute_reward(state, action, next_state):
    # 1. Reaching reward: penalize the distance to the target
    dist_to_target = np.linalg.norm(
        np.asarray(next_state['end_effector_pos']) - np.asarray(state['target_pos'])
    )
    reach_reward = -10.0 * dist_to_target

    # 2. Collision penalty
    collision_penalty = 0.0
    for obs in state['obstacle_info']:  # each entry is (x, y, z, radius)
        obs_dist = np.linalg.norm(np.asarray(next_state['end_effector_pos']) - np.asarray(obs[:3]))
        if obs_dist < obs[3]:  # inside the obstacle radius
            collision_penalty -= 50.0

    # 3. Action-smoothness penalty
    action_diff = np.linalg.norm(np.asarray(action) - np.asarray(state['last_action']))
    smooth_penalty = -0.1 * action_diff

    # 4. Energy-consumption penalty
    energy_penalty = -0.01 * np.sum(np.square(action))

    # 5. Success bonus: within 2 cm of the target counts as success
    success_bonus = 100.0 if dist_to_target < 0.02 else 0.0

    return reach_reward + collision_penalty + smooth_penalty + energy_penalty + success_bonus
```
2. Key Techniques for SAC-Based Trajectory Planning

2.1 Network Architecture Design

An enhanced SAC network implementation:

```python
import torch.nn as nn
from copy import deepcopy

class EnhancedSAC(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        # State encoder (handles high-dimensional inputs)
        self.encoder = nn.Sequential(
            nn.Linear(state_dim, 512), nn.LayerNorm(512), nn.ReLU(),
            nn.Linear(512, 256), nn.LayerNorm(256), nn.ReLU()
        )

        # Policy head
        self.actor_mean = nn.Linear(256, action_dim)
        self.actor_log_std = nn.Linear(256, action_dim)

        # Twin Q-networks
        self.critic1 = nn.Sequential(
            nn.Linear(256 + action_dim, 256), nn.ReLU(), nn.Linear(256, 1)
        )
        self.critic2 = nn.Sequential(
            nn.Linear(256 + action_dim, 256), nn.ReLU(), nn.Linear(256, 1)
        )

        # Target networks
        self.critic1_target = deepcopy(self.critic1)
        self.critic2_target = deepcopy(self.critic2)

        self.max_action = max_action
        self.alpha = 0.2  # initial temperature coefficient
```
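For reference, the core SAC update on top of such a network looks roughly like the sketch below. This is a minimal version with a fixed temperature `alpha`; `sample_action` is an assumed helper that applies the reparameterization trick with tanh squashing and returns actions plus per-sample log-probabilities of shape (batch, 1), and the two optimizers are assumed to cover the actor heads and the encoder/critics respectively:

```python
import torch
import torch.nn.functional as F

def sac_update(net, batch, actor_opt, critic_opt, gamma=0.99, tau=0.005):
    """One SAC gradient step on a batch of (s, a, r, s', done) tensors."""
    state, action, reward, next_state, done = batch

    # Critic update: soft Bellman backup with the twin target Q-networks
    with torch.no_grad():
        next_feat = net.encoder(next_state)
        next_action, next_logp = sample_action(net, next_feat)   # assumed helper
        q1_t = net.critic1_target(torch.cat([next_feat, next_action], dim=-1))
        q2_t = net.critic2_target(torch.cat([next_feat, next_action], dim=-1))
        target_q = reward + gamma * (1 - done) * (torch.min(q1_t, q2_t) - net.alpha * next_logp)

    feat = net.encoder(state)
    q1 = net.critic1(torch.cat([feat, action], dim=-1))
    q2 = net.critic2(torch.cat([feat, action], dim=-1))
    critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # Actor update: maximize the entropy-regularized Q-value
    feat = net.encoder(state)
    new_action, logp = sample_action(net, feat)
    q_new = torch.min(net.critic1(torch.cat([feat, new_action], dim=-1)),
                      net.critic2(torch.cat([feat, new_action], dim=-1)))
    actor_loss = (net.alpha * logp - q_new).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Polyak averaging of the target critics
    with torch.no_grad():
        for p, p_t in zip(net.critic1.parameters(), net.critic1_target.parameters()):
            p_t.data.mul_(1 - tau).add_(tau * p.data)
        for p, p_t in zip(net.critic2.parameters(), net.critic2_target.parameters()):
            p_t.data.mul_(1 - tau).add_(tau * p.data)

    return critic_loss.item(), actor_loss.item()
```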
2.2 Training Tricks Specific to Trajectory Planning

Curriculum learning strategy:

```python
def adjust_training_difficulty(episode):
    # Gradually increase the planning difficulty
    if episode < 100:
        # Stage 1: simple straight-line trajectories
        target_pos = easy_targets()
        obstacles = []
    elif episode < 300:
        # Stage 2: add static obstacles
        target_pos = medium_targets()
        obstacles = static_obstacles()
    else:
        # Stage 3: dynamic obstacles and harder targets
        target_pos = hard_targets()
        obstacles = dynamic_obstacles()
    return target_pos, obstacles
```
Prioritized experience replay (PER):

```python
import numpy as np

class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.alpha = alpha          # how strongly priorities skew sampling
        self.capacity = capacity
        self.buffer = []
        self.priorities = np.zeros(capacity)
        self.pos = 0

    def __len__(self):
        # Needed by the training loop's warm-up check
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        max_prio = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append((state, action, reward, next_state, done))
        else:
            self.buffer[self.pos] = (state, action, reward, next_state, done)
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        if len(self.buffer) == 0:
            return []
        prios = self.priorities[:len(self.buffer)]
        probs = prios ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        # Importance-sampling weights to correct the sampling bias
        weights = (len(self.buffer) * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, np.array(weights, dtype=np.float32)

    def update_priorities(self, batch_indices, batch_priorities):
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio
```
3. Full Training Pipeline

3.1 Main Training Loop

```python
def train_sac_trajectory_planner(env, agent, buffer, episodes=1000):
    best_eval = -float('inf')

    for ep in range(episodes):
        state = env.reset()
        episode_reward = 0
        episode_steps = 0

        # Curriculum learning: adjust the difficulty for this episode
        env.set_difficulty_level(ep)

        while True:
            # Select an action
            action = agent.select_action(state)

            # Step the environment
            next_state, reward, done, info = env.step(action)

            # Store the transition
            buffer.push(state, action, reward, next_state, done)

            # Train once the buffer is warm
            if len(buffer) > batch_size:
                # Prioritized-replay sampling
                batch, indices, weights = buffer.sample(batch_size)

                # Agent update returns the TD errors
                td_error = agent.update(batch, weights)

                # Refresh the priorities
                buffer.update_priorities(indices, abs(td_error) + 1e-5)

            state = next_state
            episode_reward += reward
            episode_steps += 1

            if done or episode_steps >= max_steps:
                break

        # Evaluate and log
        if ep % eval_interval == 0:
            eval_reward = evaluate(agent, env)
            print(f"Episode {ep}: Train Reward {episode_reward:.2f}, Eval Reward {eval_reward:.2f}")

            # Save the best model
            if eval_reward > best_eval:
                agent.save("best_sac_planner.pth")
                best_eval = eval_reward
```
3.2 Evaluation Metrics Specific to Trajectory Planning

```python
def evaluate_planner(agent, env, n_episodes=10):
    metrics = {
        'success_rate': 0.0,
        'avg_path_length': 0.0,
        'avg_clearance': 0.0,       # average minimum distance to obstacles
        'energy_consumption': 0.0,
        'smoothness': 0.0           # trajectory curvature
    }

    for _ in range(n_episodes):
        state = env.reset()
        trajectory = []
        actions = []

        for _ in range(max_steps):
            action = agent.select_action(state, evaluate=True)
            state, _, done, info = env.step(action)

            trajectory.append(state['end_effector_pos'])
            actions.append(action)

            if done:
                metrics['success_rate'] += 1.0
                break

        # Per-episode metrics
        path_len = calculate_path_length(trajectory)
        clearance = calculate_min_clearance(trajectory, env.obstacles)
        energy = calculate_energy_consumption(actions)
        smooth = calculate_trajectory_smoothness(trajectory)

        metrics['avg_path_length'] += path_len
        metrics['avg_clearance'] += clearance
        metrics['energy_consumption'] += energy
        metrics['smoothness'] += smooth

    # Average over episodes
    for k in metrics:
        metrics[k] /= n_episodes

    return metrics
```
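The metric helpers above are left unspecified. Under the assumption that `trajectory` is a list of 3-D end-effector positions and `actions` a list of torque vectors, they could look roughly like this:

```python
import numpy as np

def calculate_path_length(trajectory):
    """Sum of Euclidean distances between consecutive end-effector positions."""
    pts = np.asarray(trajectory)
    return float(np.sum(np.linalg.norm(np.diff(pts, axis=0), axis=1)))

def calculate_min_clearance(trajectory, obstacles):
    """Smallest surface distance from the path to any (x, y, z, radius) obstacle."""
    pts = np.asarray(trajectory)
    clearances = [np.min(np.linalg.norm(pts - np.asarray(obs[:3]), axis=1)) - obs[3]
                  for obs in obstacles]
    return float(min(clearances)) if clearances else float('inf')

def calculate_energy_consumption(actions):
    """Energy proxy: sum of squared torque commands over the episode."""
    return float(np.sum(np.square(np.asarray(actions))))

def calculate_trajectory_smoothness(trajectory):
    """Smoothness proxy: mean norm of the second difference of positions."""
    pts = np.asarray(trajectory)
    if len(pts) < 3:
        return 0.0
    return float(np.mean(np.linalg.norm(np.diff(pts, n=2, axis=0), axis=1)))
```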
4. Optimizations for Real-World Deployment

4.1 Sim-to-Real Transfer

Domain randomization:

```python
def apply_domain_randomization(env):
    # Randomize the dynamics parameters
    env.randomize_dynamics(
        mass_range=(0.8, 1.2),
        friction_range=(0.5, 1.5),
        damping_range=(0.9, 1.1)
    )

    # Sensor noise
    env.add_sensor_noise(
        pos_noise_std=0.005,   # 5 mm position noise
        force_noise_std=0.1    # 0.1 N force noise
    )

    # Visual appearance variation
    env.randomize_visuals(
        texture_range=(0.5, 1.5),
        lighting_range=(0.7, 1.3)
    )
```
4.2 Online Adaptive Learning

```python
class OnlineAdaptor:
    def __init__(self, agent):
        self.agent = agent
        self.real_world_buffer = ReplayBuffer(50000)
        self.adaptation_steps = 0

    def adapt(self, real_world_samples):
        # Add real-world data to a dedicated buffer
        for sample in real_world_samples:
            self.real_world_buffer.push(*sample)

        # Mixed training
        if len(self.real_world_buffer) > 1000:
            # Sample half the batch from simulation and half from the real world
            sim_batch = self.agent.replay_buffer.sample(batch_size // 2)
            real_batch = self.real_world_buffer.sample(batch_size // 2)

            # Interleave the two batches
            combined_batch = []
            for i in range(batch_size // 2):
                combined_batch.append(sim_batch[i])
                combined_batch.append(real_batch[i])

            # Update the critic every step
            self.agent.update_critic(combined_batch)

            # Update the actor less frequently
            if self.adaptation_steps % 3 == 0:
                self.agent.update_actor(combined_batch)

            self.adaptation_steps += 1
```
5. Advanced Techniques and Recent Progress

5.1 Hierarchical SAC Planning

```python
import numpy as np

class HierarchicalSAC:
    def __init__(self, high_level_state_dim, low_level_state_dim, action_dim, waypoint_dim):
        # High-level policy (plans waypoints)
        self.high_level_policy = SAC(high_level_state_dim, waypoint_dim, ...)
        # Low-level policy (tracks the waypoints)
        self.low_level_policy = SAC(low_level_state_dim, action_dim, ...)

    def plan_and_execute(self, env, state):
        # High-level planning
        waypoints = self.high_level_policy.select_action(state)

        # Low-level execution
        for wp in waypoints:
            action = self.low_level_policy.select_action(np.concatenate([state, wp]))
            state, reward, done, info = env.step(action)
            if done:
                break
```
5.2 Combining SAC with Motion Primitives

```python
class SACWithPrimitives:
    def __init__(self, state_dim, primitive_dim):
        self.primitives = load_motion_primitives()            # library of predefined motion primitives
        self.selector = SAC(state_dim, len(self.primitives))  # selector network

    def select_action(self, state):
        # Pick a motion primitive
        prim_idx = self.selector.select_action(state)
        primitive = self.primitives[prim_idx]

        # Adapt the primitive's parameters to the current state
        adapted_action = primitive.adapt(state)
        return adapted_action
```
5.3 Recent Research Directions
- SAC + Diffusion:
  - Uses a diffusion model to generate smoother trajectories
  - Illustrative code snippet:

    ```python
    class DiffusionPolicy(nn.Module):
        def forward(self, state, noise):
            # Iterative multi-step denoising; T, self.model, and self.step are
            # assumed to be defined elsewhere in the diffusion policy
            for t in reversed(range(T)):
                noise_pred = self.model(state, noise, t)
                noise = self.step(noise, noise_pred)
            return noise  # the fully denoised action
    ```

- SAC + Transformer (a minimal history-encoder sketch follows this list):
  - Uses a Transformer to encode the trajectory history
  - Handles long-horizon planning problems

- Multi-goal SAC:
  - Learns multiple sub-goals simultaneously
  - Enables hierarchical planning
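As referenced above, a minimal sketch of a Transformer-based history encoder that could replace the MLP encoder in EnhancedSAC; the class name and hyperparameters are illustrative, not taken from any particular paper or library:

```python
import torch
import torch.nn as nn

class TrajectoryTransformerEncoder(nn.Module):
    """Encode the last H states into a single feature vector for the SAC heads."""
    def __init__(self, state_dim, d_model=128, n_heads=4, n_layers=2, history_len=16):
        super().__init__()
        self.input_proj = nn.Linear(state_dim, d_model)
        self.pos_embed = nn.Parameter(torch.zeros(1, history_len, d_model))
        layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=256,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

    def forward(self, state_history):
        # state_history: (batch, history_len, state_dim)
        x = self.input_proj(state_history) + self.pos_embed
        x = self.encoder(x)
        return x[:, -1]   # feature of the most recent timestep
```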
With the methods above you can build a capable SAC-based trajectory-planning system. The key is to tailor the state representation, reward function, and training strategy to your specific application, and to bring in domain knowledge when designing the network architecture and training pipeline.