Actor-Critic in PyTorch (Morvan Python study notes)

I studied the Actor-Critic code and theory and rewrote it as a PyTorch version.

Compared with Policy Gradient (PG), Actor-Critic supports single-step updates: the critic network judges how good each step was, instead of waiting until the end of the episode to look at the total return.
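
Concretely, the actor is updated with the critic's one-step TD error rather than the Monte-Carlo return of a whole episode. Below is a minimal sketch of that idea for a single transition; the state values and log-probability are placeholder numbers, not outputs of the networks in this post:

import torch

gamma = 0.99
r = 1.0                                             # reward from one env.step
v_s = torch.tensor(0.50)                            # critic's estimate V(s)
v_s_next = torch.tensor(0.55)                       # critic's estimate V(s')
log_prob = torch.tensor(-0.7, requires_grad=True)   # log pi(a|s) from the actor

# TD error: how much better this step turned out than the critic expected
td_error = r + gamma * v_s_next - v_s

# actor loss for this single step; no need to wait for the episode return G_t
actor_loss = -log_prob * td_error.detach()
actor_loss.backward()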

1. run_this.py

import gym
from RL_Brain import Actor_Critic
import matplotlib.pyplot as plt

def train_RL():
    global rewards
    for episode in range(200):
        observation = env.reset()   # classic gym API (< 0.26): reset() returns only the observation
        env.render()                # renders the first frame of each episode; move it inside the loop to watch every step
        done = False
        ep_r = 0
        while not done:
            a, log_prob = RL.choose_action(observation)

            # classic gym API: step() returns 4 values
            observation_, reward, done, info = env.step(a)

            ep_r += reward

            # single-step (TD) update from the current transition
            RL.learn(log_prob, observation, observation_, reward)
            observation = observation_
        rewards.append(ep_r)
        print('episode:', episode, 'reward:', ep_r)
    plt.plot(rewards)
    plt.show()

if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    RL = Actor_Critic(
        n_features=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        n_neurons=300,
        lr_a=3e-4,
        lr_c=5e-4,
        reward_decay=0.99
    )
    rewards = []
    train_RL()
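
Note: the loop above assumes the classic gym API (before gym 0.26), where env.reset() returns only the observation and env.step() returns four values. On gym >= 0.26 or gymnasium, the episode loop would change roughly as in this sketch (reusing env and RL from the script above; not part of the original code):

observation, info = env.reset()
done = False
while not done:
    a, log_prob = RL.choose_action(observation)
    observation_, reward, terminated, truncated, info = env.step(a)
    done = terminated or truncated
    RL.learn(log_prob, observation, observation_, reward)
    observation = observation_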

2. RL_Brain.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical

class Actor(nn.Module):
    def __init__(self,
                 state_dim,
                 action_dim,
                 n_neurons):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(state_dim, n_neurons)
        self.l2 = nn.Linear(n_neurons, action_dim)

        self.ln = nn.LayerNorm(n_neurons)

    def forward(self, s):
        if isinstance(s, np.ndarray):
            s = torch.FloatTensor(s)
        x = self.ln(F.relu(self.l1(s)))
        out = F.softmax(self.l2(x), dim=-1)   # action probabilities
        return out

class Critic(nn.Module):
    def __init__(self,
                 state_dim,
                 n_neurons):
        super(Critic, self).__init__()
        self.l1 = nn.Linear(state_dim, n_neurons)
        self.l2 = nn.Linear(n_neurons, 1)

        self.ln = nn.LayerNorm(n_neurons)

    def forward(self, s):
        if isinstance(s, np.ndarray):
            s = torch.FloatTensor(s)
        x = self.ln(F.relu(self.l1(s)))
        out = self.l2(x)
        return out

class Actor_Critic:

    def __init__(self,
                 n_features,
                 n_actions,
                 n_neurons,
                 lr_a,
                 lr_c,
                 reward_decay
                ):
        self.state_dim = n_features
        self.action_dim = n_actions
        self.n_neurons = n_neurons
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.gamma = reward_decay

        self.actor = Actor(self.state_dim, self.action_dim, self.n_neurons)
        self.critic = Critic(self.state_dim, self.n_neurons)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c)

        self.loss = nn.MSELoss()

    def choose_action(self, s):
        probs = self.actor(s)              # action probabilities from the policy network
        dist = Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)   # kept so learn() can backpropagate into the actor

        return action.item(), log_prob

    def learn(self, log_prob, s, s_, reward):
        v = self.critic(s)      # V(s)
        v_ = self.critic(s_)    # V(s')

        # critic: regress V(s) toward the one-step TD target r + gamma * V(s').
        # The target is detached so the critic is not also trained to move V(s').
        # (Terminal states are not masked here: learn() does not receive the done flag.)
        td_target = reward + self.gamma * v_.detach()
        critic_loss = self.loss(v, td_target)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # actor: policy gradient weighted by the TD error (used as the advantage);
        # detach so the actor loss does not backpropagate into the critic
        td = reward + self.gamma * v_ - v
        loss_actor = -log_prob * td.detach()
        self.actor_optim.zero_grad()
        loss_actor.backward()
        self.actor_optim.step()
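
For reference, a quick standalone sanity check of the class above (hypothetical usage on dummy data; assumes RL_Brain.py is importable):

from RL_Brain import Actor_Critic
import numpy as np

agent = Actor_Critic(n_features=4, n_actions=2, n_neurons=300,
                     lr_a=3e-4, lr_c=5e-4, reward_decay=0.99)

s = np.random.randn(4).astype(np.float32)    # fake current state
s_ = np.random.randn(4).astype(np.float32)   # fake next state

a, log_prob = agent.choose_action(s)         # a is an integer action index
agent.learn(log_prob, s, s_, reward=1.0)     # one single-step update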

