import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np
import matplotlib.pyplot as plt
# Define the Actor network
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(state_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh()
        )
        self.max_action = max_action

    def forward(self, state):
        # Scale the Tanh output to the environment's action range
        return self.max_action * self.layer(state)
# Define the Critic network
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(state_dim + action_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1)
        )

    def forward(self, state, action):
        # Q(s, a): concatenate state and action along the feature dimension
        return self.layer(torch.cat([state, action], 1))
# DDPG agent
class DDPG:
    def __init__(self, env):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.max_action = float(env.action_space.high[0])

        # Create the networks and their target copies
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(self.state_dim, self.action_dim)
        self.critic_target = Critic(self.state_dim, self.action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # Replay buffer and hyperparameters
        self.replay_buffer = []
        self.buffer_size = 100000
        self.batch_size = 64
        self.gamma = 0.99   # discount factor
        self.tau = 0.005    # soft-update rate

    def select_action(self, state, noise=True):
        state = torch.FloatTensor(state.reshape(1, -1))
        action = self.actor(state).cpu().data.numpy().flatten()
        if noise:
            # Gaussian exploration noise
            action += np.random.normal(0, 0.1, size=self.action_dim)
        return np.clip(action, -self.max_action, self.max_action)
    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        # Sample a mini-batch of transitions
        batch = np.random.choice(len(self.replay_buffer), self.batch_size)
        states, actions, rewards, next_states, dones = zip(*[self.replay_buffer[i] for i in batch])
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions))
        rewards = torch.FloatTensor(np.array(rewards)).unsqueeze(1)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(1 - np.array(dones, dtype=np.float32)).unsqueeze(1)

        # Update the Critic: minimize MSE against the Bellman target
        target_actions = self.actor_target(next_states)
        target_q = self.critic_target(next_states, target_actions)
        target_q = rewards + (dones * self.gamma * target_q).detach()
        current_q = self.critic(states, actions)
        critic_loss = nn.MSELoss()(current_q, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update the Actor: maximize the Critic's Q-value for the Actor's actions
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks: theta' <- tau * theta + (1 - tau) * theta'
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
# Training loop (uses the classic gym API where reset() returns the observation
# and step() returns (obs, reward, done, info); gym < 0.26 is assumed)
def train_ddpg(env_name="Pendulum-v1", target_reward=-200, max_episodes=500):
    env = gym.make(env_name)
    agent = DDPG(env)
    rewards = []
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        while True:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.append((state, action, reward, next_state, done))
            if len(agent.replay_buffer) > agent.buffer_size:
                agent.replay_buffer.pop(0)
            episode_reward += reward
            state = next_state
            agent.train()
            if done:
                break

        # Evaluation episode without exploration noise
        test_reward = 0
        state = env.reset()
        while True:
            action = agent.select_action(state, noise=False)
            state, reward, done, _ = env.step(action)
            test_reward += reward
            if done:
                break
        rewards.append(test_reward)
        avg_reward = np.mean(rewards[-10:])  # average reward over the last 10 episodes

        # Live visualization of the learning curve
        plt.clf()
        plt.plot(rewards, label='Episode Reward')
        plt.plot([np.mean(rewards[max(0, i - 10):i + 1]) for i in range(len(rewards))], label='Moving Average')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.title(f'DDPG Training Progress ({env_name})')
        plt.legend()
        plt.pause(0.001)

        print(f"Episode: {episode+1}, Reward: {test_reward:.2f}, Avg Reward: {avg_reward:.2f}")

        # Stop once the target average reward is reached
        if avg_reward >= target_reward:
            print(f"\nReached target average reward of {target_reward} at episode {episode+1}!")
            break
    env.close()
    plt.show()
    return agent

# Start training
if __name__ == "__main__":
    trained_agent = train_ddpg()
Please add to the code above a step that renders the trained agent's final successful performance once training has finished.
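A minimal sketch of such a final demo follows. The helper name render_final_performance is made up for illustration; it assumes it is appended to the script above (so gym and the trained agent are in scope) and uses the same pre-0.26 gym API as the rest of the code. On gym >= 0.26 or gymnasium you would instead create the environment with render_mode="human" and unpack the five values returned by step().

# Hypothetical helper: render the trained policy once, without exploration noise.
# Assumes the classic gym API (reset() -> obs, step() -> (obs, reward, done, info)).
def render_final_performance(agent, env_name="Pendulum-v1", episodes=1):
    env = gym.make(env_name)
    for _ in range(episodes):
        state = env.reset()
        total_reward = 0
        while True:
            env.render()                                      # draw the current frame
            action = agent.select_action(state, noise=False)  # deterministic policy
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        print(f"Final demo reward: {total_reward:.2f}")
    env.close()

One way to use it is to call it right after training in the __main__ block, e.g. trained_agent = train_ddpg() followed by render_final_performance(trained_agent).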