I studied the code and theory of Actor-Critic and rewrote the implementation in PyTorch.
Compared with policy gradient (PG), Actor-Critic can update at every single step: the critic network judges how good each step was, instead of waiting until the episode ends to look at the total return.
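Concretely, each transition (s_t, a_t, r_t, s_{t+1}) in the code below yields a TD error that plays the role of the advantage:

\[
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
\]
\[
L_{\text{critic}} = \delta_t^{2}, \qquad L_{\text{actor}} = -\log \pi_\theta(a_t \mid s_t)\,\delta_t
\]

By contrast, REINFORCE-style PG replaces \(\delta_t\) with the Monte Carlo return \(G_t = \sum_{k \ge 0} \gamma^k r_{t+k}\), which is only known after the episode finishes.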
1. run_this.py
import gym
import matplotlib.pyplot as plt

from RL_Brain import Actor_Critic


def train_RL():
    global rewards
    for episode in range(200):
        observation = env.reset()
        env.render()
        done = False
        ep_r = 0
        while not done:
            # the actor picks an action and returns the log-probability needed for the update
            a, log_prob = RL.choose_action(observation)
            observation_, reward, done, info = env.step(a)
            ep_r += reward
            # single-step (online) update: learn from this transition immediately
            RL.learn(log_prob, observation, observation_, reward)
            observation = observation_
        rewards.append(ep_r)
        print('episode:', episode, 'reward:', ep_r)
    plt.plot(rewards)
    plt.show()
    return 0


if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    RL = Actor_Critic(
        n_features=env.observation_space.shape[0],  # state dimension (4 for CartPole)
        n_actions=env.action_space.n,               # number of discrete actions (2 for CartPole)
        n_neurons=300,
        lr_a=3e-4,           # actor learning rate
        lr_c=5e-4,           # critic learning rate
        reward_decay=0.99    # discount factor gamma
    )
    rewards = []
    train_RL()
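A note on environment versions: the rollout loop above assumes the classic gym API, where env.reset() returns only the observation and env.step() returns four values. If you run under gymnasium (or gym >= 0.26), the signatures differ; a minimal, untested adaptation of the loop might look like this (only the environment interaction changes, RL_Brain is unaffected):

# sketch assuming gymnasium's 5-tuple step API; not part of the original code
import gymnasium as gym

env = gym.make('CartPole-v1')
observation, info = env.reset()
done = False
while not done:
    a, log_prob = RL.choose_action(observation)
    observation_, reward, terminated, truncated, info = env.step(a)
    done = terminated or truncated
    RL.learn(log_prob, observation, observation_, reward)
    observation = observation_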
2. RL_Brain.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical


class Actor(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""
    def __init__(self, state_dim, action_dim, n_neurons):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(state_dim, n_neurons)
        self.l2 = nn.Linear(n_neurons, action_dim)
        self.ln = nn.LayerNorm(n_neurons)

    def forward(self, s):
        if isinstance(s, np.ndarray):
            s = torch.FloatTensor(s)
        x = self.ln(F.relu(self.l1(s)))
        out = F.softmax(self.l2(x), dim=-1)
        return out


class Critic(nn.Module):
    """Value network: maps a state to a scalar state-value estimate V(s)."""
    def __init__(self, state_dim, n_neurons):
        super(Critic, self).__init__()
        self.l1 = nn.Linear(state_dim, n_neurons)
        self.l2 = nn.Linear(n_neurons, 1)
        self.ln = nn.LayerNorm(n_neurons)

    def forward(self, s):
        if isinstance(s, np.ndarray):
            s = torch.FloatTensor(s)
        x = self.ln(F.relu(self.l1(s)))
        out = self.l2(x)
        return out


class Actor_Critic:
    def __init__(self, n_features, n_actions, n_neurons, lr_a, lr_c, reward_decay):
        self.state_dim = n_features
        self.action_dim = n_actions
        self.n_neurons = n_neurons
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.gamma = reward_decay

        self.actor = Actor(self.state_dim, self.action_dim, self.n_neurons)
        self.critic = Critic(self.state_dim, self.n_neurons)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c)
        self.loss = nn.MSELoss()

    def choose_action(self, s):
        # sample an action from the categorical distribution produced by the actor
        a = self.actor(s)
        dist = Categorical(a)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.detach().numpy(), log_prob

    def learn(self, log_prob, s, s_, reward):
        # critic update: regress V(s) towards the one-step TD target r + gamma * V(s')
        v = self.critic(s)
        v_ = self.critic(s_)
        critic_loss = self.loss(reward + self.gamma * v_.detach(), v)  # detach the bootstrapped target
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # actor update: the TD error serves as the advantage estimate
        td = reward + self.gamma * v_ - v
        loss_actor = -log_prob * td.detach()
        self.actor_optim.zero_grad()
        loss_actor.backward()
        self.actor_optim.step()
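One detail worth flagging: learn() always bootstraps the target from V(s'), even on the final transition of an episode, where no future reward exists. A common refinement, shown here as a hypothetical variant rather than part of the original code, is to pass the done flag and mask the bootstrap term:

    # hypothetical variant of Actor_Critic.learn that masks the bootstrap at terminal states
    def learn(self, log_prob, s, s_, reward, done):
        v = self.critic(s)
        v_ = self.critic(s_)
        # no bootstrapping from V(s') once the episode has ended
        target = reward + self.gamma * v_.detach() * (1 - int(done))
        critic_loss = self.loss(target, v)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        td = target - v
        loss_actor = -log_prob * td.detach()
        self.actor_optim.zero_grad()
        loss_actor.backward()
        self.actor_optim.step()

With this variant, the training loop would call RL.learn(log_prob, observation, observation_, reward, done) instead.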