### A Concrete Implementation of the MADDPG Training Process for Cooperative Navigation
MADDPG (Multi-Agent Deep Deterministic Policy Gradient) is a reinforcement learning algorithm designed for cooperative multi-agent settings. Its core idea is centralized training with decentralized execution, which allows multiple agents to coordinate on tasks such as cooperative navigation.
In a cooperative navigation task, each agent acts on its own local observation, while each Critic network is trained with global information (the observations and actions of all agents), enabling more effective policy updates. This scheme mitigates the partial observability problem and improves coordination among the agents[^2].
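In the standard MADDPG formulation, each agent $i$ has a centralized critic $Q_i$ trained on a temporal-difference target built from every agent's next action, and a decentralized actor $\mu_i$ updated by following the critic's gradient through agent $i$'s own action:

$$
y_i = r_i + \gamma\, Q_i^{\mu'}\!\left(x', a_1', \ldots, a_N'\right)\Big|_{a_j' = \mu_j'(o_j')},
\qquad
\mathcal{L}(\theta_i) = \mathbb{E}\left[\left(Q_i^{\mu}(x, a_1, \ldots, a_N) - y_i\right)^2\right]
$$

$$
\nabla_{\theta_i} J(\mu_i) = \mathbb{E}\left[\nabla_{\theta_i}\mu_i(o_i)\,\nabla_{a_i} Q_i^{\mu}(x, a_1, \ldots, a_N)\Big|_{a_i = \mu_i(o_i)}\right]
$$

Here $x$ is the joint observation of all $N$ agents, $o_i$ is agent $i$'s local observation, and $\mu'$, $Q^{\mu'}$ denote target networks (omitted in the simplified code below).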
The following code example sketches the MADDPG training process for a cooperative navigation task:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# Actor network: maps a single agent's local observation to an action
class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.tanh(self.fc3(x))  # actions are bounded to [-1, 1]
        return x
# Critic network: scores the joint observations and joint actions of all agents
class Critic(nn.Module):
    def __init__(self, obs_dim, act_dim, n_agents):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear((obs_dim + act_dim) * n_agents, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.relu = nn.ReLU()

    def forward(self, obs, act):
        # obs: (batch, n_agents * obs_dim), act: (batch, n_agents * act_dim)
        x = torch.cat([obs, act], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x
# MADDPG agent: one Actor/Critic pair (and optimizers) per agent
class MADDPGAgent:
    def __init__(self, actor, critic, n_agents, obs_dim, act_dim, lr_actor=1e-4, lr_critic=1e-3, gamma=0.95):
        self.n_agents = n_agents
        self.gamma = gamma
        self.actors = [actor(obs_dim, act_dim) for _ in range(n_agents)]
        self.critics = [critic(obs_dim, act_dim, n_agents) for _ in range(n_agents)]
        self.actor_optims = [optim.Adam(self.actors[i].parameters(), lr=lr_actor) for i in range(n_agents)]
        self.critic_optims = [optim.Adam(self.critics[i].parameters(), lr=lr_critic) for i in range(n_agents)]

    def select_actions(self, observations):
        # Decentralized execution: each agent acts only on its local observation
        actions = []
        for i, obs in enumerate(observations):
            obs_tensor = torch.FloatTensor(obs)
            with torch.no_grad():
                action = self.actors[i](obs_tensor).numpy()
            actions.append(action)
        return actions

    def update(self, replay_buffer, batch_size=64):
        sampled_transitions = replay_buffer.sample(batch_size)
        observations, actions, rewards, next_observations, dones = zip(*sampled_transitions)
        observations = torch.FloatTensor(np.array(observations))            # (batch, n_agents, obs_dim)
        actions = torch.FloatTensor(np.array(actions))                      # (batch, n_agents, act_dim)
        rewards = torch.FloatTensor(np.array(rewards))                      # (batch, n_agents)
        next_observations = torch.FloatTensor(np.array(next_observations))  # (batch, n_agents, obs_dim)
        dones = torch.FloatTensor(np.array(dones))                          # (batch, n_agents)

        obs_flat = observations.reshape(batch_size, -1)
        act_flat = actions.reshape(batch_size, -1)
        next_obs_flat = next_observations.reshape(batch_size, -1)

        # Centralized training: next joint actions come from every agent's actor
        with torch.no_grad():
            next_actions = torch.cat(
                [self.actors[j](next_observations[:, j]) for j in range(self.n_agents)], dim=1)

        # Update each Critic against its TD target
        for i in range(self.n_agents):
            with torch.no_grad():
                target_q = self.critics[i](next_obs_flat, next_actions).squeeze(-1)
                target_q = rewards[:, i] + (1 - dones[:, i]) * self.gamma * target_q
            current_q = self.critics[i](obs_flat, act_flat).squeeze(-1)
            critic_loss = nn.MSELoss()(current_q, target_q)
            self.critic_optims[i].zero_grad()
            critic_loss.backward()
            self.critic_optims[i].step()

        # Update each Actor: its own action is recomputed (differentiable),
        # the other agents' actions are taken from the sampled batch
        for i in range(self.n_agents):
            joint_actions = [actions[:, j] for j in range(self.n_agents)]
            joint_actions[i] = self.actors[i](observations[:, i])
            actor_loss = -self.critics[i](obs_flat, torch.cat(joint_actions, dim=1)).mean()
            self.actor_optims[i].zero_grad()
            actor_loss.backward()
            self.actor_optims[i].step()
# Simple replay buffer (uniform sampling with replacement)
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = []
        self.capacity = capacity

    def add(self, transition):
        self.buffer.append(transition)
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size)
        return [self.buffer[i] for i in indices]

    def __len__(self):
        return len(self.buffer)
# Example usage with randomly generated transitions
if __name__ == "__main__":
    n_agents = 2
    obs_dim = 4
    act_dim = 2
    maddpg_agent = MADDPGAgent(Actor, Critic, n_agents, obs_dim, act_dim)
    replay_buffer = ReplayBuffer()

    # Simulated training loop (a real task would step an environment here)
    for episode in range(1000):
        observations = np.random.rand(n_agents, obs_dim)        # simulated observations
        actions = maddpg_agent.select_actions(observations)
        rewards = np.random.rand(n_agents)                      # simulated rewards
        next_observations = np.random.rand(n_agents, obs_dim)   # simulated next observations
        dones = np.random.choice([0, 1], size=n_agents)         # simulated termination flags
        replay_buffer.add((observations, actions, rewards, next_observations, dones))
        if len(replay_buffer) >= 64:  # wait until a full batch is available
            maddpg_agent.update(replay_buffer)
```
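Note that `select_actions` above is purely deterministic. In practice, MADDPG (like DDPG) adds exploration noise to the actions during training. A minimal sketch, assuming Gaussian noise with an illustrative scale of `0.1` clipped to the `tanh` action range; the helper name `select_actions_with_noise` is hypothetical:

```python
import numpy as np

def select_actions_with_noise(agent, observations, noise_std=0.1):
    # Hypothetical helper: adds Gaussian exploration noise to the deterministic
    # actions returned by MADDPGAgent.select_actions and clips to [-1, 1].
    actions = agent.select_actions(observations)
    return [np.clip(a + np.random.normal(0.0, noise_std, size=a.shape), -1.0, 1.0)
            for a in actions]
```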
### Code Walkthrough
- **Actor network**: generates each agent's action; the input is that agent's local observation and the output is an action.
- **Critic network**: estimates the value of the joint state-action of all agents; the input is the concatenated observations and actions of every agent.
- **MADDPGAgent class**: manages every agent's Actor and Critic networks and implements action selection and network updates.
- **ReplayBuffer class**: stores transitions for experience replay, which improves training stability.
- **Training loop**: simulates a simple training cycle of observation, action selection, reward collection, and experience replay.
This implementation illustrates the basic MADDPG training flow for cooperative navigation, where multiple agents complete a navigation task together in a shared environment. During training, each Critic is updated with global information, while each Actor adjusts its policy based on the Critic's feedback, enabling efficient multi-agent cooperation[^2].
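For brevity, the implementation above also omits target networks, which the full MADDPG algorithm uses to stabilize the TD target: each actor and critic keeps a slowly moving copy that is blended toward the online network after every update (Polyak averaging). A minimal sketch of such a soft update; the `target_actors` / `target_critics` attributes are a hypothetical extension of the `MADDPGAgent` class above, created with `copy.deepcopy`:

```python
import copy

import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, tau: float = 0.01):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# Hypothetical extension of MADDPGAgent.__init__:
#     self.target_actors = [copy.deepcopy(a) for a in self.actors]
#     self.target_critics = [copy.deepcopy(c) for c in self.critics]
# and at the end of MADDPGAgent.update, after the optimizer steps:
#     for i in range(self.n_agents):
#         soft_update(self.target_actors[i], self.actors[i])
#         soft_update(self.target_critics[i], self.critics[i])
```

With target networks in place, the TD target in `update` would be computed from `self.target_actors` and `self.target_critics` rather than the online networks.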