I previously built an agent for this task with a traditional tabular method; interested readers can refer to 《强化学习_03_表格方法实践(CartPole-v0 And MontoCarlo)》. There, even with the target state space restricted, 15,000 training episodes only reached a score of around 130. This time we train the agent with DQN.
1. Building the Agent
The policy is the same as before: epsilon-greedy exploration vs. exploitation, where exploitation takes the action from the nn model's prediction. The core methods are:
- initialize: fits the data-standardization parameters on the first buffer of experiences and initializes the QNet.
- update: refreshes the estimated Q value of each state with the target
  Q(s, a) = r + gamma * max_a' Q(s', a')
  - Q(s, a) is predicted by the Q network from each sample's state;
  - Q(s', a') is predicted by the Q network from each sample's n_state;
  - the target above replaces the estimate of the action actually taken in each sample;
  - state and the refreshed estimates are then used to update the network parameters.
(The imports and the Experience record the code relies on are sketched right after this list.)
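A minimal sketch of what the snippets below assume. The Experience field names (s, a, r, n_s, d) are inferred from how the code accesses them, so treat them as an assumption rather than the exact definition in dqn_rl.py:

# Imports and the experience record assumed by the code in this post.
# Experience field names are inferred from e.s / e.a / e.r / e.n_s / e.d usage.
import random
from collections import deque, namedtuple

import gym
import numpy as np
import torch as t
import torch.nn as nn

Experience = namedtuple('Experience', ['s', 'a', 'r', 'n_s', 'd'])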
class CartPoleActor:
    def __init__(self, epsilon, actions):
        self.actions = actions
        self.epsilon = epsilon
        self.scaler_model = None
        self.model = None
        self.initialized = False
        ...

    def initialize(self, experiences):
        self.scaler_model = Scaler()
        self.model = QNeuralEstimator(
            input_dim=experiences[0].s.shape[0],
            hidden_layer_size=[10, 10],
            output_dim=len(self.actions)
        )
        # fit the scaler on the states of the first buffer of experiences
        states = t.tensor(np.vstack([e.s for e in experiences]))
        self.scaler_model.fit(states)
        self.update([experiences[0]], gamma=0)
        self.initialized = True
        print('Done initialization. From now, begin training!')

    def estimate(self, s):
        s = self.scaler_model.predict(s)
        return self.model(s)

    def _predict(self, states):
        if self.initialized:
            return self.estimate(states)
        # before initialization, return random values for every action
        dim_ = len(self.actions)
        size = dim_ * len(states)
        return np.random.uniform(size=size).reshape((-1, dim_))

    def update(self, batch_experiences, gamma):
        """Refresh the estimated Q value of each state:
        Q(s, a) = r + gamma * max_a' Q(s', a')
        """
        states = t.tensor(np.vstack([e.s for e in batch_experiences]))
        n_states = t.tensor(np.vstack([e.n_s for e in batch_experiences]))
        # build the TD targets without tracking gradients
        with t.no_grad():
            estimateds = self.estimate(states)
            futures = self.estimate(n_states)
        for i, e in enumerate(batch_experiences):
            R = e.r
            if not e.d:
                R += gamma * t.max(futures[i])
            # overwrite only the value of the action actually taken
            estimateds[i][e.a] = R
        self.model.fit(self.scaler_model.predict(states), estimateds.float())

    def policy(self, s):
        # epsilon-greedy: act randomly while exploring or before initialization
        if random.random() < self.epsilon or not self.initialized:
            return np.random.randint(len(self.actions))
        estimates = self.estimate(s)
        return np.argmax(estimates.detach().numpy())
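As a quick worked example of the update target (numbers are my own): with gamma = 0.9, a non-terminal transition with reward 1.0 and next-state predictions Q(s', ·) = [0.4, 0.7] gives a refreshed target of 1.0 + 0.9 * 0.7 = 1.63 for the action that was taken; for a terminal transition the target is just the reward, 1.0.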
2. QNet and Scaler
2.1 Scaler
Scaler is a simple standardization class I wrote:
class Scaler:
    """Standardization helper (torch based).

    sample:
        ta = t.tensor([range(100), range(100, 200)]).float()
        sc = Scaler()
        sc.fit(ta)
        sc.predict(ta)
    """
    def __init__(self, method='standard'):
        self.method = method

    def standard_scaler(self, tensor_in):
        # per-feature mean and std over the batch dimension
        self.a = t.mean(tensor_in.float(), dim=0)
        self.b = t.std(tensor_in.float(), dim=0)

    def fit(self, tensor_in):
        if self.method == 'standard':
            self.standard_scaler(tensor_in)

    def predict(self, tensor_in):
        if isinstance(tensor_in, np.ndarray):
            tensor_in = t.tensor(tensor_in)
        if self.method == 'standard':
            return (tensor_in.float() - self.a) / self.b

    def save(self, file):
        # persist the fitted mean/std so inference can reuse the training statistics
        a = ','.join(str(v) for v in self.a.numpy())
        b = ','.join(str(v) for v in self.b.numpy())
        with open(file, 'w') as f:
            f.write(f'{a};{b}')

    def load(self, file):
        with open(file, 'r') as f:
            a, b = f.read().split(';')
        self.a = t.tensor([float(v) for v in a.split(',')]).float()
        self.b = t.tensor([float(v) for v in b.split(',')]).float()
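As a quick illustration (my own snippet, not from the original script), after fitting on a small batch each feature should come out with roughly zero mean and unit standard deviation:

# Quick check: after standardization each column has ~0 mean and ~1 std.
sc = Scaler()
batch = t.tensor(np.random.randn(256, 4) * 5.0 + 3.0).float()
sc.fit(batch)
scaled = sc.predict(batch)
print(t.mean(scaled, dim=0))   # ~0 per feature
print(t.std(scaled, dim=0))    # ~1 per feature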
2.2 QNet
QNet is really just a simple fully connected regression network. On top of the usual nn.Module it adds model_compelet, which creates the loss function and the optimizer, and update, which performs one parameter update on a batch of samples.
class QNeuralEstimator(nn.Module):
    """QNet: a simple nn regressor.

    sample:
        n_model = QNeuralEstimator(input_dim=3, hidden_layer_size=[16, 16], output_dim=2)
        x1 = t.tensor([[1, 2, 3], [1, 2, 3]]).float()
        y1 = t.tensor([[1, 1], [1, 1]]).float()
        n_model(x1)
        n_model.update(x1, y1)
        n_model(x1)
    """
    def __init__(self, input_dim, hidden_layer_size, output_dim):
        super(QNeuralEstimator, self).__init__()
        self.features = nn.ModuleList()
        for i, h in enumerate(hidden_layer_size):
            self.features.append(nn.ModuleDict({
                'linear': nn.Linear(input_dim, h) if not i
                else nn.Linear(hidden_layer_size[i - 1], h),
                'linear_active': nn.ReLU(inplace=True)
            }))
        self.out = nn.Linear(hidden_layer_size[-1], output_dim)
        self.model_compelet()

    def forward(self, x):
        for layer in self.features:
            x = layer['linear'](x)
            x = layer['linear_active'](x)
        return self.out(x)

    def model_compelet(self):
        # build the loss function and optimizer
        self.cost_func = nn.MSELoss()
        self.opt = t.optim.Adam(self.parameters(), lr=0.001)

    def update(self, batch_x, batch_y):
        # one gradient step on a batch (MSE regression toward the targets)
        self.opt.zero_grad()
        pred_ = self(batch_x)
        loss = self.cost_func(pred_, batch_y)
        loss.backward()
        self.opt.step()

    def fit(self, x, y):
        self.update(x, y)
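To sanity-check the regressor in isolation, the toy snippet below (my own example, not part of dqn_rl.py) fits QNeuralEstimator on a small synthetic batch and prints the MSE before and after a few hundred updates; the loss should drop noticeably:

# Toy check of QNeuralEstimator on synthetic data (illustrative only).
net = QNeuralEstimator(input_dim=4, hidden_layer_size=[10, 10], output_dim=2)
x = t.rand(64, 4)
y = t.rand(64, 2)
before = nn.MSELoss()(net(x), y).item()
for _ in range(200):        # a few hundred Adam steps on the same batch
    net.fit(x, y)
after = nn.MSELoss()(net(x), y).item()
print(f'MSE before: {before:.4f}, after: {after:.4f}')   # 'after' should be smaller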
3. Training the Model
The core pieces are:
- agent.initialize: called once the replay buffer is full, to fit the scaler and build the QNet.
- step: samples a batch from the buffer and calls agent.update to update the model parameters.
- A strict early stop is added, because during exploration the agent can sample states whose mean and variance are larger than those seen at initialization; this point can be seen in my exploration of the environment in the previous article 《强化学习_03_表格方法实践(CartPole-v0 And MontoCarlo)》.
class CartPoleActorTrainer:
    def __init__(self, buffer_size=1024, batch_size=32, gamma=0.9):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.experiences = deque(maxlen=buffer_size)
        self.training = False
        self.training_count = 0

    def train_loop(self, env, agent, episode=200, render=False):
        # strict early stop: halt once the episode reward stays at the same
        # (optimal) value for 8 consecutive trained episodes
        th_ = 8
        not_improve_cnt = 0
        episode_reward = deque(maxlen=2)
        episode_reward.append(0)
        for e in range(episode):
            s = env.reset()
            done = False
            step_count = 0
            episode_r = 0
            while not done:
                if render:
                    env.render()
                a = agent.policy(s)
                n_state, reward, done, info = env.step(a)
                episode_r += reward
                # collect the sample into the replay buffer
                self.experiences.append(
                    Experience(s, a, reward, n_state, done)
                )
                if not self.training and \
                        len(self.experiences) == self.buffer_size:
                    agent.initialize(self.experiences)
                    self.training = True
                # sample from the buffer and update the parameters
                self.step(agent)
                s = n_state
                step_count += 1
            else:
                self.episode_end(e, step_count)
                episode_reward.append(episode_r)
                if self.training:
                    self.training_count += 1
                    if episode_reward[1] == episode_reward[0]:
                        not_improve_cnt += 1
                        print(f'keep times {not_improve_cnt}')
                    else:
                        not_improve_cnt = 0
                    if not_improve_cnt == th_:
                        break

    def train(self, env, episode_count=200, epsilon=0.1, render=False):
        actions = list(range(env.action_space.n))
        agent = CartPoleActor(epsilon, actions)
        self.train_loop(env, agent, episode_count, render)
        return agent

    def step(self, agent):
        if self.training:
            batch = random.sample(self.experiences, self.batch_size)
            agent.update(batch, self.gamma)

    def episode_end(self, episode, step_count):
        # total reward of the episode that just finished
        recent_idx = range(len(self.experiences) - step_count, len(self.experiences))
        recent = [self.experiences[i] for i in recent_idx]
        rewards = sum([e.r for e in recent])
        print(f'[{episode}]-Trained({self.training_count}) Reward- {rewards}')
4. Observing the Trained Agent's Performance
The DQN agent clearly outperforms the tabular method and reaches the game's reward cap of 200.
env = CartPoleObserver(gym.make('CartPole-v0'))
trainer = CartPoleActorTrainer(buffer_size=1024, batch_size=128, gamma=0.8)
trained_agent = trainer.train(env, episode_count=320)
trained_agent.play(env)
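trained_agent.play(env) and the CartPoleObserver wrapper live in the full script rather than in this excerpt. A minimal evaluation loop in the same spirit (my own sketch, assuming the old gym reset/step API used above) would look like:

# Minimal evaluation sketch (assumes reset() -> obs and step() -> 4-tuple).
def evaluate(env, agent, episodes=5, render=True):
    for ep in range(episodes):
        s = env.reset()
        done, total = False, 0
        while not done:
            if render:
                env.render()
            a = agent.policy(s)          # epsilon-greedy; mostly greedy once trained
            s, r, done, _ = env.step(a)
            total += r
        print(f'[eval {ep}] reward: {total}')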
The full script is available on my GitHub: dqn_rl.py