
Code listings for Deep Reinforcement Learning: Principles and Practice (《深度强化学习原理与实践》)
图灵保佑
2021-11-07
import torch

class SharedAdam(torch.optim.Adam):
    # params       -- iterable of parameters to optimize, or a dict defining parameter groups
    # lr           -- learning rate (default: 1e-3)
    # betas        -- coefficients used to compute running averages of the gradient and its square (default: (0.9, 0.999))
    # eps          -- term added to the denominator for numerical stability (default: 1e-8)
    # weight_decay -- weight decay (L2 penalty) (default: 0)
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
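The constructor body is cut off above. A minimal sketch of how a shared Adam optimizer for A3C is commonly written (an assumption based on the widely used pattern, not necessarily the book's verbatim code): the per-parameter Adam state is created eagerly and moved into shared memory so that worker processes launched with torch.multiprocessing all update the same statistics.

import torch

class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas,
                                         eps=eps, weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # Build the Adam state up front instead of lazily in step().
                # Note: newer PyTorch releases may expect a different layout for state['step'].
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # Move the state into shared memory so every worker process
                # reads and updates the same optimizer statistics.
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()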
2021-11-1 9
import math
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt

use_cuda = torch.cuda.is_available()
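Only the imports and the CUDA check survive in this preview. As an illustration of what the Normal import is typically used for in a continuous-action policy-gradient listing, here is a hedged sketch of a Gaussian policy head that reuses the imports above (the class name, layer sizes, and activation are assumptions, not the book's code):

device = torch.device("cuda" if use_cuda else "cpu")

class GaussianPolicy(nn.Module):
    """Maps a state to a Normal distribution over continuous actions."""
    def __init__(self, n_states, n_actions, hidden=128):
        super(GaussianPolicy, self).__init__()
        self.fc = nn.Linear(n_states, hidden)
        self.mu = nn.Linear(hidden, n_actions)                # mean of the Gaussian
        self.log_std = nn.Parameter(torch.zeros(n_actions))   # learned log standard deviation

    def forward(self, state):
        x = F.relu(self.fc(state))
        mu = self.mu(x)
        std = self.log_std.exp().expand_as(mu)
        return Normal(mu, std)   # sample actions with dist.sample() or dist.rsample()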
2021-10-31 8
The network architecture can be adapted to match the input.

# Hyper Parameters
BATCH_SIZE = 32
LR = 0.01                   # learning rate
EPSILON = 0.9               # greedy policy
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # target update frequency
MEMORY_CAPACITY =
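The listing breaks off at the replay-buffer size. Below is a minimal DQN sketch built around the hyperparameters above; the buffer size of 2000, the 50-unit hidden layer, and the CartPole-sized N_STATES / N_ACTIONS are assumptions for illustration, not the book's values.

import numpy as np
import torch
import torch.nn as nn

MEMORY_CAPACITY = 2000        # assumed value; the original figure is cut off above
N_STATES, N_ACTIONS = 4, 2    # placeholder sizes (e.g. CartPole)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 50)
        self.out = nn.Linear(50, N_ACTIONS)

    def forward(self, x):
        return self.out(torch.relu(self.fc1(x)))

class DQN:
    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))   # each row: (s, a, r, s')
        self.memory_counter = 0
        self.learn_step_counter = 0
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)

    def store_transition(self, s, a, r, s_):
        index = self.memory_counter % MEMORY_CAPACITY         # overwrite the oldest entries
        self.memory[index, :] = np.hstack((s, [a, r], s_))
        self.memory_counter += 1

    def choose_action(self, s):
        s = torch.FloatTensor(s).unsqueeze(0)
        if np.random.uniform() < EPSILON:                     # exploit with probability EPSILON
            return int(self.eval_net(s).argmax(dim=1).item())
        return np.random.randint(N_ACTIONS)                   # otherwise explore

    def learn(self, b_s, b_a, b_r, b_s_):
        # b_s, b_s_: FloatTensor [B, N_STATES]; b_a: LongTensor [B, 1]; b_r: FloatTensor [B, 1]
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())   # sync the target network
        self.learn_step_counter += 1
        q_eval = self.eval_net(b_s).gather(1, b_a)                        # Q(s, a) for the taken actions
        q_next = self.target_net(b_s_).detach()                           # no gradient through the target
        q_target = b_r + GAMMA * q_next.max(1, keepdim=True)[0]           # TD target
        loss = nn.functional.mse_loss(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()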
2021-10-25 7.18
class BPModel(nn.Module):
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)
        self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(10, n_y)

    def forward(self, x):
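The forward pass is cut off above. A self-contained sketch of the full module, assuming ReLU activations on the hidden layers and a linear output (the activation choice is not visible in the listing):

import torch
import torch.nn as nn

class BPModel(nn.Module):
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)
        self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(10, n_y)

    def forward(self, x):
        x = torch.relu(self.layer1(x))   # assumed hidden activation
        x = torch.relu(self.layer2(x))
        return self.layer3(x)            # linear output (e.g. Q-values or logits)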
2021-10-24 7.7
class Monte_Carlo_Policy_Gradient():
    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):
        self.nA = env.action_space.n               # number of actions
        self.nS = env.observation_space.shape[0]   # dimension of the state space
        self.env = env                             # environment
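Only the constructor is visible. For reference, the core of a Monte Carlo policy gradient (REINFORCE) agent is the discounted-return computation and the log-probability-weighted loss; a hedged, self-contained sketch (the function names and the return normalization are assumptions):

import numpy as np
import torch

def discounted_returns(rewards, gamma=0.95):
    """G_t = r_t + gamma * G_{t+1}, computed backwards over one finished episode."""
    G = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        G[t] = running
    return (G - G.mean()) / (G.std() + 1e-8)   # normalization is a common variance-reduction trick

def reinforce_loss(log_probs, returns):
    """Policy-gradient surrogate: -sum_t log pi(a_t | s_t) * G_t."""
    log_probs = torch.stack(log_probs)                        # list of 0-dim tensors -> [T]
    returns = torch.as_tensor(returns, dtype=torch.float32)   # [T]
    return -(log_probs * returns).sum()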
2021-10-23 6.9
# Value function approximator
class Estimator():
    def __init__(self):
        state_examples = np.array([env.observation_space.sample() for x in range(10000)])
        # Sample the environment 10000 times so that features can later be extracted from the states
        # Feature processing step 1: normalize the state data to zero mean and unit variance
        self.scaler = Scaler()
        self.scaler.fit(state_examples)
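The constructor is cut off after the scaler is fitted. In the common pattern this listing resembles, the scaled states are then projected onto radial-basis-function features on which a linear value model can be trained; the sketch below is an assumption (environment name, gammas, and component counts are placeholders), not the book's code.

import gym
import numpy as np
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

env = gym.make("MountainCarContinuous-v0")   # placeholder environment for illustration
state_examples = np.array([env.observation_space.sample() for _ in range(10000)])

# Step 1: normalize the sampled states to zero mean and unit variance.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_examples)

# Step 2 (assumed): project the scaled states onto RBF features of several bandwidths.
featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
    ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
])
featurizer.fit(scaler.transform(state_examples))

def featurize_state(state):
    """Scale a raw observation and map it to its RBF feature vector."""
    return featurizer.transform(scaler.transform([state]))[0]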
2021-10-17 5.13
class Q_learning():
    def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10):
        self.nA = env.action_space.n               # number of actions
        self.nS = env.observation_space.shape[0]   # dimension of the state space
        self.env = env                             # environment
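For reference, the backup such a class performs is the standard off-policy Q-learning (TD(0)) update, which moves Q(s, a) toward r + gamma * max_a' Q(s', a'); a minimal tabular sketch:

import numpy as np
from collections import defaultdict

def q_learning_update(Q, state, action, reward, next_state, alpha=0.5, discount=1.0):
    """One Q-learning backup on a tabular / dictionary action-value function."""
    best_next = np.argmax(Q[next_state])                        # greedy action in the next state
    td_target = reward + discount * Q[next_state][best_next]    # bootstrap from the best next value
    Q[state][action] += alpha * (td_target - Q[state][action])
    return Q

# Hypothetical usage with two actions per state:
Q = defaultdict(lambda: np.zeros(2))
Q = q_learning_update(Q, state=0, action=1, reward=1.0, next_state=1)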
2021-10-17 5.7
class SARSA():
    def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10):
        self.nA = env.action_space.n               # number of actions
        self.nS = env.observation_space.shape[0]   # dimension of the state space
        self.env = env                             # environment
        self.num_episodes = num_episodes           # number of episodes to run
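SARSA differs from the Q-learning listing above only in its target: the backup uses the action actually chosen in the next state rather than the greedy one. A minimal sketch of the on-policy update:

def sarsa_update(Q, state, action, reward, next_state, next_action, alpha=0.5, discount=1.0):
    """One on-policy TD(0) backup: bootstrap from Q(s', a') for the action actually taken."""
    td_target = reward + discount * Q[next_state][next_action]
    Q[state][action] += alpha * (td_target - Q[state][action])
    return Q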
2021-10-17 4.13
def create_random_policy(nA):
    A = np.ones(nA, dtype=float) / nA     # create a random (uniform) policy
    def policy_fn(observation):           # policy function
        return A
    return policy_fn

def create_greedy_policy(Q):
    def policy_fn(state):                 # create a greedy policy
        A = np.zeros_like(Q[state], dtype=float)
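The greedy policy is cut off mid-line; the standard completion puts all probability mass on the action with the highest value under Q (an assumption consistent with the pattern of create_random_policy above):

import numpy as np

def create_greedy_policy(Q):
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        A[np.argmax(Q[state])] = 1.0     # deterministic: all mass on the best action
        return A
    return policy_fn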
2021-10-17 4.10
# Epsilon-greedy algorithm
def epsilon_greedy_policy(q, epsilon, nA):
    def __policy__(state):
        A_ = np.ones(nA, dtype=float)   # initialize the action probabilities
        A = A_ * epsilon / nA           # spread epsilon / nA probability over every action
        best = np.argmax(q[state])      # the action with the largest action value is the greedy choice
        A[best] += 1 - epsilon          # the greedy action gets the remaining 1 - epsilon probability mass
        return A
    return __policy__
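A short usage example for the policy factory above (the q table here is a hypothetical placeholder): the returned function yields a probability vector from which an action can be sampled.

import numpy as np

q = {0: np.array([0.1, 0.5, 0.2])}                     # hypothetical action values for state 0
policy = epsilon_greedy_policy(q, epsilon=0.1, nA=3)
probs = policy(0)                                      # approx. [0.033, 0.933, 0.033]
action = np.random.choice(len(probs), p=probs)         # sample an action from the distribution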
2021-10-17 3.9
# For a given state, compute the expected value of each action a
def clac_action_value(state, V, discount_factor=1.0):
    A = np.zeros(env.nA)      # initialize the vector of expected action values
    for a in range(env.nA):   # iterate over all actions in the current state
        for prob, next_state, reward, done in env.P[state][a]:
            A[a] += prob * (reward + discount_factor * V[next_state])
    return A
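A sketch of how such a helper is typically used inside value iteration, reusing clac_action_value and the global env from the listing above (the stopping threshold and the overall function are assumptions, not the book's code):

import numpy as np

def value_iteration(discount_factor=1.0, theta=1e-6):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            best = np.max(clac_action_value(s, V, discount_factor))   # best one-step backup
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:                                             # stop once the values settle
            break
    # Read off the deterministic greedy policy from the converged values.
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        policy[s, np.argmax(clac_action_value(s, V, discount_factor))] = 1.0
    return policy, V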
2021-10-17 3.5
def policy_iteration(env, policy, discount_factor=1.0):
    while True:
        V = policy_evaluation(policy, env, discount_factor)   # policy evaluation
        policy_stable = True       # policy-stable flag; set to False once the policy changes at some state
        for s in range(env.nS):    # iterate over the states
            old_action = np.argmax(policy[s])
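The improvement loop is cut off above. A sketch of the standard greedy improvement sweep such a loop performs, written against the same Gym env.P convention (assumed, not the book's verbatim code):

import numpy as np

def policy_improvement_step(env, policy, V, discount_factor=1.0):
    """One greedy sweep; returns the updated policy and whether it is already stable."""
    policy_stable = True
    for s in range(env.nS):
        old_action = np.argmax(policy[s])                   # action the current policy prefers
        action_values = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[s][a]:
                action_values[a] += prob * (reward + discount_factor * V[next_state])
        best_action = np.argmax(action_values)
        if old_action != best_action:
            policy_stable = False                           # the policy changed at this state
        policy[s] = np.eye(env.nA)[best_action]             # make the policy greedy at s
    return policy, policy_stable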
2021-10-17 3.2
# discount_factor is the discount factor and theta is the change threshold:
# iteration stops once the state-value function changes by no more than theta
def policy_evaluation(policy, environment, discount_factor=1.0, theta=1.0):
    env = environment
    V = np.zeros(env.nS)        # initialize an all-zero value vector to record the state values
    for _ in range(10000):      # start iterating
        delta = 0
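The sweep body is cut off after delta = 0. For reference, a single Bellman expectation backup over all states under the fixed policy usually looks like this (a sketch under the same env.P convention, not necessarily the book's exact code):

import numpy as np

def evaluation_sweep(env, policy, V, discount_factor=1.0):
    """One full backup of every state; returns the updated values and the largest change."""
    delta = 0
    for s in range(env.nS):
        v = 0
        for a, action_prob in enumerate(policy[s]):              # pi(a | s)
            for prob, next_state, reward, done in env.P[s][a]:   # p(s', r | s, a)
                v += action_prob * prob * (reward + discount_factor * V[next_state])
        delta = max(delta, abs(v - V[s]))
        V[s] = v
    return V, delta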
2021-10-17 2.1
# nA is the number of actions and T is the total number of time steps
def epsilon_greedy(nA, R, T, epsilon=0.6):
    r = 0             # initialize the cumulative reward r
    N = [0] * nA      # initialize the count for every action to zero
    for _ in range(T):
        if np.random.rand() < epsilon:
            a = np.random.randint(q_value.shape[0])
        else:
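The loop breaks off at the exploitation branch. A self-contained sketch of how an epsilon-greedy bandit loop is usually completed; here R is assumed to be a callable returning a sampled reward for the chosen arm, and q_value holds incremental mean estimates (both assumptions, since the preview does not show them):

import numpy as np

def epsilon_greedy(nA, R, T, epsilon=0.6):
    r = 0                                # cumulative reward
    N = [0] * nA                         # pull count per arm
    q_value = np.zeros(nA)               # incremental estimate of each arm's mean reward
    for _ in range(T):
        if np.random.rand() < epsilon:
            a = np.random.randint(nA)               # explore: pick a random arm
        else:
            a = int(np.argmax(q_value))             # exploit: pick the best estimate so far
        reward = R(a)                               # pull the arm and observe a reward
        N[a] += 1
        q_value[a] += (reward - q_value[a]) / N[a]  # incremental mean update
        r += reward
    return r, q_value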