torch.distributions Explained

This article looks at the parameterizable probability distributions provided by the distributions module, shows how to compute stochastic gradients with the REINFORCE and pathwise derivative estimators as used in reinforcement learning and variational autoencoders, and explains how the score function and the reparameterization trick are implemented.

The distributions package contains parameterizable probability distributions and sampling functions. This allows the construction of stochastic computation graphs and stochastic gradient estimators for optimization.

In general, it is not possible to backpropagate directly through random samples. However, there are two main ways to create surrogate functions that can be backpropagated through: the score function estimator (also called the likelihood ratio estimator or REINFORCE) and the pathwise derivative estimator. REINFORCE is commonly seen as the basis for policy gradient methods in reinforcement learning, and the pathwise derivative estimator is commonly seen in the reparameterization trick in variational autoencoders. The score function only requires the value of the samples $f(x)$, while the pathwise derivative requires the derivative $f'(x)$. The next sections discuss both in a reinforcement learning example.
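
Written out explicitly (a standard formulation of the two estimators, stated here for reference rather than taken verbatim from the PyTorch documentation), the score function estimator differentiates the log-density, while the pathwise estimator pushes the gradient through a reparameterized sample $x = g_\theta(\epsilon)$ with parameter-free noise $\epsilon$:

$$\nabla_\theta \, \mathbb{E}_{x \sim p_\theta}[f(x)] = \mathbb{E}_{x \sim p_\theta}\left[ f(x) \, \nabla_\theta \log p_\theta(x) \right]$$

$$\nabla_\theta \, \mathbb{E}_{x \sim p_\theta}[f(x)] = \mathbb{E}_{\epsilon \sim q}\left[ \nabla_\theta f\left( g_\theta(\epsilon) \right) \right], \qquad x = g_\theta(\epsilon), \ \epsilon \sim q$$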

Score function

When the probability density function is differentiable with respect to its parameters, we only need sample() and log_prob() to implement REINFORCE:

$$\Delta\theta = \alpha r \nabla_{\theta} \log \pi_{\theta}(a \mid s)$$

where $\theta$ are the parameters, $\alpha$ is the learning rate, $r$ is the reward, and $\pi_{\theta}(a \mid s)$ is the probability of taking action $a$ in state $s$ under policy $\pi_{\theta}$.

In practice, we sample an action from the network's output, apply that action in an environment, and then use log_prob to construct an equivalent loss function. Note the negative sign: optimizers use gradient descent, while the rule above assumes gradient ascent.

With a categorical policy, the code for implementing REINFORCE is as follows:

probs = policy_network(state)
# Note that this is equivalent to what used to be called multinomial
m = Categorical(probs)
action = m.sample()
next_state, reward = env.step(action)
loss = -m.log_prob(action) * reward
loss.backward()
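
The snippet above assumes an existing policy_network and environment. As a minimal self-contained sketch (the linear policy, fixed state, and constant reward below are hypothetical stand-ins, purely for illustration), the same surrogate loss can be exercised end to end:

import torch
from torch.distributions import Categorical

# Hypothetical stand-ins for policy_network, state, and the environment's reward
policy_network = torch.nn.Linear(4, 2)          # maps a 4-dim state to 2 action logits
state = torch.randn(4)
reward = torch.tensor(1.0)                      # pretend the environment returned reward 1

probs = torch.softmax(policy_network(state), dim=-1)
m = Categorical(probs)
action = m.sample()                             # sampling itself is not differentiable
loss = -m.log_prob(action) * reward             # surrogate loss for the REINFORCE rule
loss.backward()
print(policy_network.weight.grad.shape)         # torch.Size([2, 4]); gradients reach the policy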

Pathwise derivative

Another way to implement these stochastic/policy gradients is to use the reparameterization trick via the rsample() method, where the parameterized random variable is constructed as a parameterized deterministic function of a parameter-free random variable. The reparameterized sample is therefore differentiable. The code for implementing the pathwise derivative is as follows:

params = policy_network(state)
m = Normal(*params)
# Any distribution with .has_rsample == True could work based on the application
action = m.rsample()
next_state, reward = env.step(action) # Assuming that reward is differentiable
loss = -reward
loss.backward()
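
Under the hood, rsample() for a Normal is just the reparameterization loc + scale * eps with eps drawn from a standard normal, which is why gradients can flow back into the distribution's parameters. A small sketch of this behaviour (the variable names mu and sigma are illustrative):

import torch
from torch.distributions import Normal

mu = torch.tensor([0.0], requires_grad=True)
sigma = torch.tensor([1.0], requires_grad=True)

x = Normal(mu, sigma).rsample()   # internally: mu + sigma * eps, eps ~ N(0, 1)
x.backward()
print(mu.grad, sigma.grad)        # d x / d mu = 1, d x / d sigma = eps

y = Normal(mu, sigma).sample()    # sample() detaches the graph
print(y.requires_grad)            # False, so no pathwise gradient is available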

Normal

class torch.distributions.normal.Normal(loc, scale, validate_args=None)

Base class: torch.distributions.exp_family.ExponentialFamily

Creates a normal (Gaussian) distribution parameterized by loc and scale.

Example:

>>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # normally distributed with loc=0 and scale=1
tensor([0.1046])

Parameters:

  • loc (float or Tensor) — mean of the distribution (often referred to as mu)
  • scale (float or Tensor) — standard deviation of the distribution (often referred to as sigma)
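
A small usage sketch (output values are illustrative, since sampling is random): loc and scale may be batched, and log_prob evaluates the elementwise log-density of a given value.

import torch
from torch.distributions import Normal

# A batch of two independent Gaussians: N(0, 1) and N(2, 0.5)
m = Normal(torch.tensor([0.0, 2.0]), torch.tensor([1.0, 0.5]))

x = m.sample()            # one draw per distribution, shape (2,)
print(m.log_prob(x))      # log-density of each component at x
print(m.mean, m.stddev)   # tensor([0., 2.]) tensor([1.0000, 0.5000])
print(m.has_rsample)      # True: rsample() supports the pathwise estimator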

Reference:
https://github.com/apachecn/pytorch-doc-zh/blob/master/docs/1.0/distributions.md
