Prioritized Experience Replay (PyTorch) (莫烦Python study notes)

I worked through how prioritized experience replay is used and reimplemented it in PyTorch following the 莫烦Python tutorial.
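As a quick reminder of what the code below implements (the standard formulation from Schaul et al.'s Prioritized Experience Replay paper): each transition gets a priority derived from its TD error, transitions are sampled in proportion to priority, and importance-sampling weights correct the bias this introduces:

p_i = |\delta_i| + \epsilon, \qquad
P(i) = \frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}}, \qquad
w_i = \frac{\bigl(N \cdot P(i)\bigr)^{-\beta}}{\max_j w_j}

In the Memory class below, alpha = 0.6, epsilon = 0.01, and beta is annealed from 0.4 toward 1; dividing each weight by the minimum sampling probability is equivalent to the max-weight normalization written above.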

1. Main function

from RL_Brain import PrioritizedDQNAgent
import gym
import numpy as np


def train(RL):
    total_steps = 0
    steps = []
    episodes = []
    for episode in range(20):
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(observation)
            # the Discrete action space of MountainCar-v0 expects an integer action index
            observation_, reward, done, info = env.step(action)
            if done:
                reward = 10

            RL.store_transition(observation, action, reward, observation_)

            if total_steps > MEMORY_SIZE:
                RL.learn()

            if done:
                print('episode',episode,'finished')
                steps.append(total_steps)
                episodes.append(episode)
                break
            observation = observation_
            total_steps += 1

if __name__ == '__main__':

    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    env.seed(1)
    MEMORY_SIZE = 10000
    RL = PrioritizedDQNAgent(n_actions=3, n_features=2,
                             memory_size=MEMORY_SIZE,
                             e_greedy_increment=0.00005,
                             output_graph=True)
    train(RL)
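train() collects steps and episodes but never uses them. If it is changed to end with `return episodes, steps`, the learning curve can be plotted afterwards; here is a minimal sketch (plot_learning_curve is my own helper, not part of the tutorial):

import matplotlib.pyplot as plt

def plot_learning_curve(episodes, steps):
    """Plot cumulative training steps against the episode index."""
    plt.plot(episodes, steps, c='r', label='DQN with prioritized replay')
    plt.xlabel('episode')
    plt.ylabel('total training steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

# usage, assuming train() is modified to end with `return episodes, steps`:
# episodes, steps = train(RL)
# plot_learning_curve(episodes, steps)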



2. RL_Brain

"""
agent代码
"""

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

np.random.seed(42)
torch.manual_seed(2)

class SumTree:
    data_pointer = 0
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2*self.capacity - 1)
        self.data = np.zeros(self.capacity, dtype=object)

    def add(self, p, data):
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_idx, p)

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_idx, p):
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        while tree_idx != 0:
            tree_idx = (tree_idx-1)//2
            self.tree[tree_idx] += change

    def get_leaf(self,v):
        parent_idx = 0
        while True:
            cl_idx = 2*parent_idx + 1
            cr_idx = cl_idx + 1  # right child sits directly after the left child
            if cl_idx >= len(self.tree):
                leaf_idx = parent_idx
                break
            else:
                if v <= self.tree[cl_idx]:
                    parent_idx = cl_idx
                else:
                    v -= self.tree[cl_idx]
                    parent_idx = cr_idx
        data_idx = leaf_idx - (self.capacity - 1)
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

    @property
    def total_p(self):
        return self.tree[0]
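# How the SumTree is laid out: the last `capacity` entries of self.tree hold the leaf
# priorities, every internal node stores the sum of its two children, and tree[0] is the
# total priority. get_leaf(v) walks down from the root: go left if v <= left child's sum,
# otherwise subtract that sum and go right, so a uniform v in [0, total_p) selects leaf i
# with probability tree[leaf_i] / total_p.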


class Memory:
    epsilon = 0.01
    alpha = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    abs_err_upper = 1

    def __init__(self,capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)

    def sample(self, n):
        b_idx = np.empty((n,), dtype=np.int32)
        b_memory = np.empty((n, self.tree.data[0].size))
        ISWeights = np.empty((n, 1))
        pri_seg = self.tree.total_p / n
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p
        if min_prob == 0:
            min_prob = 0.00001  # avoid dividing by zero while some leaves are still empty
        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_p
            ISWeights[i, 0] = np.power(prob / min_prob, -self.beta)
            b_idx[i], b_memory[i, :] = idx, data
        return b_idx, b_memory, ISWeights

    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.epsilon
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

class Network(nn.Module):
    def __init__(self,n_features,n_actions,n_neuron=10):
        super(Network, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True)
        )

    def forward(self,s):
        s = s.float()
        q = self.net(s)
        return q

class PrioritizedDQNAgent:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.005,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=500,
            memory_size=10000,
            batch_size = 32,
            e_greedy_increment = None,
            output_graph= False
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.e_greedy_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0
        self.memory = Memory(capacity=memory_size)

        self.eval_net = Network(self.n_features,self.n_actions)
        self.target_net = Network(self.n_features,self.n_actions)
        self.loss_function = nn.MSELoss(reduction='none')
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),lr=self.lr)

        self.cost_his = []

    def store_transition(self,s,a,r,s_):
        # pack (s, a, r, s_) into one flat row and hand it to the prioritized replay memory
        transition = np.hstack((s, [a, r], s_))
        self.memory.store(transition)

    def choose_action(self,observation):
        # add a batch dimension: reshape the 1-D observation into a 1 x n_features matrix
        observation = observation[np.newaxis,:]

        if np.random.uniform() < self.epsilon:
            s = torch.tensor(observation)
            actions_value = self.eval_net(s)
            action = int(np.argmax(actions_value.detach().numpy()))
        else:
            action = np.random.randint(0,self.n_actions)
        return action

    def replace_target_params(self):
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.replace_target_params()
            print('\ntarget params replaced\n')

        # sample a prioritized batch: tree indices, transitions, and importance-sampling weights
        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
        ISWeights_tensor = torch.tensor(ISWeights, dtype=torch.float32).squeeze(1)


        s = torch.tensor(batch_memory[:, :self.n_features], dtype=torch.float32)
        s_ = torch.tensor(batch_memory[:, -self.n_features:], dtype=torch.float32)
        q_eval = self.eval_net(s)
        q_next = self.target_net(s_).detach()  # no gradients should flow into the target network

        q_target = q_eval.clone()
        batch_index = np.arange(self.batch_size,dtype=np.int32)
        eval_act_index = batch_memory[:,self.n_features].astype(int)
        reward = batch_memory[:,self.n_features+1].astype(np.float32)
        # note: pandas' .values returns the underlying array, while torch's .max(dim=1) returns a namedtuple whose .values holds the row-wise maxima
        q_target[batch_index, eval_act_index] = torch.tensor(reward).float() + self.gamma * q_next.max(dim= 1).values.float()

        loss = self.loss_function(q_target, q_eval)                  # element-wise squared error
        loss_per_action = loss[torch.arange(self.batch_size), eval_act_index]  # keep only the chosen actions
        weighted_loss = (loss_per_action * ISWeights_tensor).mean()  # apply the importance-sampling weights
        self.optimizer.zero_grad()
        weighted_loss.backward()
        self.optimizer.step()

        with torch.no_grad():
            # only the chosen-action entry differs between q_target and q_eval,
            # so the row-wise max is exactly each sample's absolute TD error
            abs_errors = torch.abs(q_target - q_eval).max(dim=1).values.numpy()
        # feed the new TD errors back so the replay memory can update its priorities
        self.memory.batch_update(tree_idx, abs_errors)

        self.cost_his.append(weighted_loss.item())  # record the scalar loss so plot_cost can draw a curve

        self.epsilon = self.epsilon + self.e_greedy_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.figure()
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('loss')
        plt.xlabel('training steps')
        plt.show()
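A quick way to convince yourself that the SumTree samples in proportion to priority is a small standalone check (my own sketch, not part of the tutorial; it only relies on the SumTree class above):

# Sanity check: items with larger priority should be drawn proportionally more often.
import numpy as np
from RL_Brain import SumTree

tree = SumTree(capacity=4)
for name, priority in [('a', 1.0), ('b', 2.0), ('c', 3.0), ('d', 4.0)]:
    tree.add(priority, name)

counts = {'a': 0, 'b': 0, 'c': 0, 'd': 0}
for _ in range(10000):
    v = np.random.uniform(0, tree.total_p)   # total_p is 10.0 here
    _, _, data = tree.get_leaf(v)
    counts[data] += 1

print(counts)   # expect roughly 1000 / 2000 / 3000 / 4000 draws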
