In earlier posts we trained agents with algorithms such as DQN and reached fairly high scores. In DQN the neural network outputs a Q value for each action, and the agent then takes the action with the larger Q value. But why not let the network output the action (as probabilities) directly, in a single step? Policy Gradient does exactly that.
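For reference (this is the standard REINFORCE formulation, stated here as background rather than anything specific to the code below), the policy network $\pi_\theta(a \mid s)$ is trained by following the policy gradient

$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[ G_t \, \nabla_\theta \log \pi_\theta(a_t \mid s_t) \right]$,

which in practice means minimizing the surrogate loss $-G_t \log \pi_\theta(a_t \mid s_t)$; that is exactly what the learn() method further down computes.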
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl
import numpy as np
import gym
from parl.utils import logger
from paddle.distribution import Categorical
LEARNING_RATE = 1e-3
class Model(parl.Model):
    """Policy network: maps an observation to a probability for each action."""
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hid1_size = act_dim * 10
        self.fc1 = nn.Linear(obs_dim, hid1_size)
        self.fc2 = nn.Linear(hid1_size, act_dim)

    def forward(self, obs):
        out = F.tanh(self.fc1(obs))      # hidden layer
        out = F.softmax(self.fc2(out))   # action probabilities, summing to 1
        return out
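As a quick sanity check, the model can be fed a fake observation to confirm it produces a probability distribution over actions. This is a minimal sketch assuming a CartPole-style environment with obs_dim=4 and act_dim=2:

model = Model(obs_dim=4, act_dim=2)
obs = paddle.randn([1, 4])   # one fake observation
probs = model(obs)           # shape [1, 2], each row sums to 1
print(probs)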
class PolicyGradient(parl.Algorithm):
    def __init__(self, model, lr=None):
        self.model = model
        assert isinstance(lr, float)
        self.optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters())

    def predict(self, obs):
        return self.model(obs)

    def learn(self, obs, act, reward):
        # act_prob = self.model(obs)
        # log_prob = F.cross_entropy(act_prob, act)
        # loss = log_prob.mean()
        # self.optimizer.clear_grad()
        # loss.backward()
        # self.optimizer.step()
        prob = self.model(obs)                       # pi(a|s) for every action
        log_prob = Categorical(prob).log_prob(act)   # log pi(a_t|s_t) of the actions actually taken
        loss = paddle.mean(-1 * log_prob * reward)   # -G_t * log pi(a_t|s_t), averaged over the batch
        self.optimizer.clear_grad()
        loss.backward()
        self.optimizer.step()
        return loss
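To make the loss concrete, here is a small numerical sketch (the probabilities, actions and returns are made-up numbers) of what learn() computes: the negative log-probability of each chosen action, weighted by the return credited to that step.

import numpy as np

prob = np.array([[0.7, 0.3],
                 [0.2, 0.8]])      # hypothetical policy outputs for two steps
act = np.array([0, 1])             # actions actually taken at those steps
ret = np.array([1.0, 2.0])         # returns G_t for those steps

log_prob = np.log(prob[np.arange(len(act)), act])   # log pi(a_t | s_t)
loss = np.mean(-log_prob * ret)                     # same objective as learn()
print(loss)   # ~0.401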
class Agent(parl.Agent):
    def __init__(self, algorithm, obs_dim, act_dim):
        super().__init__(algorithm)
        self.obs_dim = obs_dim
        self.act_dim = act_dim

    def sample(self, obs):
        # Sample an action according to the policy's probabilities (used during training).
        obs = paddle.to_tensor(obs, dtype='float32')
        act_prob = self.alg.predict(obs)
        act_prob = np.squeeze(act_prob, axis=0)
        act = np.random.choice(range(self.act_dim), p=act_prob.numpy())
        return act

    def predict(self, obs):
        # Pick the most probable action greedily (used for evaluation).
        obs = paddle.to_tensor(obs, dtype='float32')
        act_prob = self.alg.predict(obs)
        act = np.argmax(act_prob)
        return act

    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        loss = self.alg.learn(obs, act, reward)
        return loss.numpy()[0]
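A quick illustration of the difference between sample() and predict(). This is a hypothetical snippet assuming the CartPole-v0 environment commonly used in PARL examples:

env = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
agent = Agent(PolicyGradient(Model(obs_dim, act_dim), lr=LEARNING_RATE), obs_dim, act_dim)

obs = env.reset()
print(agent.sample(obs))    # stochastic: may differ from call to call
print(agent.predict(obs))   # greedy: argmax of the current policy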
def run_episode(env, agent):
    # Roll out one full episode and record the trajectory.
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)