Video link: https://www.bilibili.com/video/BV1Ye411P7Sw/?spm_id_from=333.1007.top_right_bar_window_history.content.click&vd_source=2613f45f05cdff9eb69129d6b00e37a2
Hand-coding the maze environment
Import libraries
plt is used for plotting the figure
np is used for creating arrays
animation is used for building the animated visualization
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation
import time
Drawing the maze figure
figure() creates the figure
gca() gets the current Axes object
set_xlim() and set_ylim() set the x and y axis ranges
plot() draws a line segment through the points ([x1, x2], [y1, y2])
text() places the label text in each cell
tick_params() customizes the style of the axis ticks and tick labels.
fig = plt.figure(figsize=(5, 5))  # create a 5x5-inch figure object
ax = plt.gca()  # get the current Axes object
ax.set_xlim(0, 3)  # set the x-axis range
ax.set_ylim(0, 3)  # set the y-axis range
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
# draw the maze walls: red segments of width 2 from (2,1) to (3,1), from (0,1) to (1,1), from (1,1) to (1,2), and from (1,2) to (2,2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# label each cell
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
# configure how the axis ticks and tick labels are displayed; this call hides all ticks and tick labels on both axes, giving a cleaner plot
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
# draw a green circular marker (marker size 60) at (0.5, 2.5); the variable line keeps a reference to this point so it can be updated later
# line, = ax.plot(...): ax.plot returns a list of Line2D objects, and the trailing comma unpacks its single element into line, which can then be used to modify or update the marker
plt.show()
# show the figure
Defining the probability of each action
# border & barrier
theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)  # allowed actions for each state (nan = blocked by a wall or the border)
def cvt_theta_0_to_pi(theta):
    # normalize each row: every allowed (non-nan) action gets an equal share of the probability
    m, n = theta.shape
    pi = np.zeros((m, n))
    for r in range(m):
        pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
    return np.nan_to_num(pi)
pi = cvt_theta_0_to_pi(theta_0)
# probability of choosing each action in each state
print(pi)
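For reference (the print is deterministic given theta_0), the output should look roughly like the following: rows are s0-s7, columns are the actions up/right/down/left, and each row sums to 1:
[[0.    0.5   0.5   0.   ]
 [0.    0.5   0.    0.5  ]
 [0.    0.    0.5   0.5  ]
 [1.    0.    0.    0.   ]
 [0.    0.5   0.5   0.   ]
 [0.5   0.    0.    0.5  ]
 [0.    1.    0.    0.   ]
 [0.333 0.333 0.    0.333]]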
Set up the current state plus the action and state histories; keep choosing a random action and moving to the resulting state until the GOAL is reached
actions = list(range(4))
print(actions)
def step(state, action):
if action == 0:
state -= 3
elif action == 1:
state += 1
elif action == 2:
state += 3
elif action == 3:
state -= 1
return state
# the state reached after taking the action (action 0 moves up, 1 right, 2 down, 3 left on the 3x3 grid)
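A quick sanity check of step() (these calls are not in the original notes; the expected values follow directly from the grid layout):
print(step(0, 1))  # 1: from S0, action 1 (right) reaches S1
print(step(0, 2))  # 3: from S0, action 2 (down) reaches S3
print(step(4, 3))  # 3: from S4, action 3 (left) reaches S3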
state = 0
action_history = []
state_history = [state]
while True:
action = np.random.choice(actions, p=pi[state, :])
state = step(state, action)
if state == 8:
state_history.append(8)
break
    action_history.append(action)  # record the action
    state_history.append(state)  # record the state
len(action_history)
print(len(action_history))
print(state_history)
Visualization: generating a .gif
def init():
line.set_data([], [])
return (line, )
def animate(i):
state = state_history[i]
    x = [(state % 3) + 0.5]
    y = [2.5 - int(state / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
# create a function-based animation: animate updates the figure frame by frame and init sets up the first frame; the number of frames equals len(state_history), frames are 200 ms apart, and the animation plays only once
name='MG'+str(time.mktime(time.localtime(time.time())))+'.gif'
anim.save(name, writer='pillow')  # save as a gif
Full code
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation
import time
fig = plt.figure(figsize=(5, 5))
# create a 5x5-inch figure object
ax = plt.gca()
# get the current Axes object
ax.set_xlim(0, 3)
# set the x-axis range
ax.set_ylim(0, 3)
# set the y-axis range
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
# draw the maze walls: red segments of width 2 from (2,1) to (3,1), from (0,1) to (1,1), from (1,1) to (1,2), and from (1,2) to (2,2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# label each cell
# plt.axis('off')
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
# configure how the axis ticks and tick labels are displayed; this call hides all ticks and tick labels on both axes, giving a cleaner plot
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
# draw a green circular marker (marker size 60) at (0.5, 2.5); the variable line keeps a reference to this point so it can be updated later
# line, = ax.plot(...): ax.plot returns a list of Line2D objects, and the trailing comma unpacks its single element into line, which can then be used to modify or update the marker
plt.show()
# show the figure
# border & barrier
theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
# allowed actions for each state (nan = blocked by a wall or the border)
def cvt_theta_0_to_pi(theta):
m, n = theta.shape
pi = np.zeros((m, n))
for r in range(m):
pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
return np.nan_to_num(pi)
pi = cvt_theta_0_to_pi(theta_0)
# probability of choosing each action in each state
print(pi)
actions = list(range(4))
print(actions)
def step(state, action):
if action == 0:
state -= 3
elif action == 1:
state += 1
elif action == 2:
state += 3
elif action == 3:
state -= 1
return state
# the state reached after taking the action
state = 0
action_history = []
state_history = [state]
while True:
action = np.random.choice(actions, p=pi[state, :])
state = step(state, action)
if state == 8:
state_history.append(8)
break
    action_history.append(action)  # record the action
    state_history.append(state)  # record the state
len(action_history)
print(len(action_history))
print(state_history)
def init():
line.set_data([], [])
return (line, )
def animate(i):
state = state_history[i]
    x = [(state % 3) + 0.5]
    y = [2.5 - int(state / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
# create a function-based animation: animate updates the figure frame by frame and init sets up the first frame; the number of frames equals len(state_history), frames are 200 ms apart, and the animation plays only once
name='MG'+str(time.mktime(time.localtime(time.time())))+'.gif'
anim.save(name, writer='pillow')  # save as a gif
Wrapping the code into MazeEnv and Agent classes
The MazeEnv class
__init__() is required; it initializes the environment
step() executes an action, updates the environment, computes the reward, checks the termination condition, and returns the result of the interaction, i.e. it defines the state after each action
reset() resets the environment
class MazeEnv(gym.Env):
def __init__(self):
self.state = 0
    def step(self, action):  # execute the action, update the environment, compute the reward, check termination, and return the result
if action == 0:
self.state -=3
elif action == 1:
self.state +=1
elif action == 2:
self.state +=3
elif action == 3:
self.state -=1
done = False
if self.state == 8:
done = True
        return self.state, 1, done, False, {}
def reset(self):
self.state = 0
return self.state
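A minimal interaction sketch (assuming gymnasium has been imported as gym, as in the full code below); the comments show the expected values:
env = MazeEnv()
state = env.reset()                                  # 0: back at START
state, reward, done, truncated, info = env.step(2)   # action 2 = down, so S0 -> S3
print(state, reward, done)                           # 3 1 False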
The Agent class
__init__() initializes the agent
_cvt_theta_0_to_pi() converts the table of possible actions into a probability distribution
choose_action() randomly selects an action according to that distribution
class Agent:
def __init__(self):
self.actions=list(range(4))
self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
self.pi=self._cvt_theta_0_to_pi(self.theta_0)
def _cvt_theta_0_to_pi(self,theta):
m, n = theta.shape
pi = np.zeros((m, n))
for r in range(m):
pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
return np.nan_to_num(pi)
def choose_action(self, state):
action = np.random.choice(self.actions,p=self.pi[state,:])
return action
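A quick sketch of using the Agent on its own; the pi row for S0 follows from theta_0 above:
agent = Agent()
print(agent.pi[0, :])          # [0.  0.5 0.5 0. ]: from S0 only right (1) and down (2) are possible
print(agent.choose_action(0))  # randomly prints 1 or 2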
Full code
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import time
import os
plots_dir = 'plots\\MG2\\'
if not os.path.exists(plots_dir):
os.makedirs(plots_dir)
# directory where the .gif files are stored
class MazeEnv(gym.Env):
def __init__(self):
self.state = 0
    def step(self, action):  # execute the action, update the environment, compute the reward, check termination, and return the result
if action == 0:
self.state -=3
elif action == 1:
self.state +=1
elif action == 2:
self.state +=3
elif action == 3:
self.state -=1
done = False
if self.state == 8:
done = True
        return self.state, 1, done, False, {}
def reset(self):
self.state = 0
return self.state
class Agent:
def __init__(self):
self.actions=list(range(4))
self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
self.pi=self._cvt_theta_0_to_pi(self.theta_0)
def _cvt_theta_0_to_pi(self,theta):
m, n = theta.shape
pi = np.zeros((m, n))
for r in range(m):
pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
return np.nan_to_num(pi)
def choose_action(self, state):
action = np.random.choice(self.actions,p=self.pi[state,:])
return action
env = MazeEnv()
state = env.reset()
agent = Agent()
done = False
action_history=[]
state_history=[state]
while not done:
action = agent.choose_action(state)
state, reward, done,_, info = env.step(action)
action_history.append(action)
state_history.append(state)
print(action_history)
print(state_history)
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()
def init():
line.set_data([], [])
return (line,)
def animate(i):
state = state_history[i]
x = [(state % 3) + 0.5]
y = [2.5 - int(state / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save as a gif
Updating the policy with softmax
At the start of every episode the environment is reset.
Within an episode: each chosen action yields the next state and reward, and the state-action pair is appended to the history until the episode ends. After each episode the parameters theta are updated, and this repeats until the policy converges.
A more general way of converting theta into probabilities, based on softmax
# a more general, softmax-based way of converting theta into probabilities
def _softmax_cvt_theta_to_pi(self, beta=1.0):
m, n = self.theta.shape
pi = np.zeros((m, n))
exp_theta = np.exp(self.theta * beta)
for r in range(m):
pi[r, :] = exp_theta[r, :] / np.nansum(exp_theta[r, :])
pi = np.nan_to_num(pi)
return pi
def update_theta(self, s_a_history):
T = len(s_a_history) - 1
m, n = self.theta.shape
delta_theta = self.theta.copy()
for i in range(m):
for j in range(n):
if not (np.isnan(self.theta_0[i, j])):
sa_i = [sa for sa in s_a_history if sa[0] == i]
                # select from s_a_history all state-action pairs whose state is i and store them in sa_i
sa_ij = [sa for sa in s_a_history if (sa[0] == i and sa[1] == j)]
                # select from s_a_history the state-action pairs with state i and action j and store them in sa_ij
N_i = len(sa_i)
N_ij = len(sa_ij)
delta_theta[i, j] = (N_ij - self.pi[i, j] * N_i) / T
self.theta = self.theta + self.eta * delta_theta
return self.theta
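In formula form, the update implemented by update_theta is
\[
\Delta\theta_{s,a} = \frac{N(s,a) - \pi_\theta(s,a)\,N(s)}{T}, \qquad \theta_{s,a} \leftarrow \theta_{s,a} + \eta\,\Delta\theta_{s,a},
\]
where N(s,a) counts how often the pair (s,a) occurs in the episode, N(s) counts the visits to state s, T is the number of steps in the episode, and η is the learning rate eta. Actions taken more often than the current policy predicts get their θ increased, which raises their softmax probability after update_pi().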
Full code
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import time
import os
plots_dir = 'plots\\MG3\\'
if not os.path.exists(plots_dir):
os.makedirs(plots_dir)
class MazeEnv(gym.Env):
def __init__(self):
self.state = 0
def step(self, action):
if action == 0:
self.state -=3
elif action == 1:
self.state +=1
elif action == 2:
self.state +=3
elif action == 3:
self.state -=1
done = False
if self.state == 8:
done = True
        return self.state, 1, done, False, {}
def reset(self):
self.state = 0
return self.state
class Agent:
def __init__(self):
self.actions=list(range(4))
self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
self.theta = self.theta_0
self.pi = self._softmax_cvt_theta_to_pi()
self.eta = 0.1
    # a more general, softmax-based way of converting theta into probabilities
def _softmax_cvt_theta_to_pi(self, beta=1.0):
m, n = self.theta.shape
pi = np.zeros((m, n))
exp_theta = np.exp(self.theta * beta)
for r in range(m):
pi[r, :] = exp_theta[r, :] / np.nansum(exp_theta[r, :])
pi = np.nan_to_num(pi)
return pi
def update_theta(self, s_a_history):
T = len(s_a_history) - 1
m, n = self.theta.shape
delta_theta = self.theta.copy()
for i in range(m):
for j in range(n):
if not (np.isnan(self.theta_0[i, j])):
sa_i = [sa for sa in s_a_history if sa[0] == i]
sa_ij = [sa for sa in s_a_history if (sa[0] == i and sa[1] == j)]
N_i = len(sa_i)
N_ij = len(sa_ij)
delta_theta[i, j] = (N_ij - self.pi[i, j] * N_i) / T
self.theta = self.theta + self.eta * delta_theta
return self.theta
def update_pi(self):
self.pi = self._softmax_cvt_theta_to_pi()
return self.pi
def choose_action(self, state):
action = np.random.choice(self.actions,p=self.pi[state,:])
return action
stop_eps = 1e-4
agent = Agent()
env = MazeEnv()
while True:
    # repeatedly start from the initial state and generate one trajectory
state = env.reset()
# state, action
s_a_history = [[state, np.nan]]
while True:
action = agent.choose_action(state)
s_a_history[-1][1] = action
state, reward, done, _,_ = env.step(action)
# print(action, state, done)
s_a_history.append([state, np.nan])
if state == 8 or done:
break
    # update theta
agent.update_theta(s_a_history)
pi = agent.pi.copy()
    # update pi
agent.update_pi()
delta = np.sum(np.abs(agent.pi - pi))
print(len(s_a_history), delta)
if delta < stop_eps:
break
print(agent.pi)
print(s_a_history)
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()
def init():
line.set_data([], [])
return (line,)
def animate(i):
state = s_a_history[i]
#print(state[0])
x = [(state[0] % 3) + 0.5]
y = [2.5 - int(state[0] / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow') # 生成gif
The SARSA algorithm
First decide how many epochs (training episodes) the outer loop will run. At the start of each epoch the environment is reset,
and the maximum of each row of agent.Q (i.e. the best Q value of every state, ignoring NaN) is computed and stored in old_Q.
Within an epoch: the chosen action yields the next state and reward, the state-action pair is appended to the history, and the Q value is updated, until the episode ends. After the episode, the row-wise maxima of Q (ignoring NaN) are recomputed, and the total change relative to old_Q is used to decide whether training has converged.
SARSA update
If s_next is the terminal state, the Q value is updated using only the immediate reward r; otherwise it is updated using the immediate reward r plus the future reward, i.e. the discount factor γ times the Q value of the next state-action pair, Q(s_next, a_next).
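In formula form, the update implemented by the sarsa method below is
\[
Q(s, a) \leftarrow Q(s, a) + \eta \bigl( r + \gamma\, Q(s_{\text{next}}, a_{\text{next}}) - Q(s, a) \bigr),
\]
with the γ term dropped when s_next is the goal state 8, since no future reward follows the goal.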
def sarsa(self, s, a, r, s_next, a_next):
if s_next == 8:
self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
else:
self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * self.Q[s_next, a_next] - self.Q[s, a])
Action selection: the ε-greedy strategy
def get_action(self, s):
# eps, explore
if np.random.rand() < self.eps:
action = np.random.choice(self.action_space, p=self.pi[s, :])
else:
# 1-eps, exploit
action = np.nanargmax(self.Q[s, :])
return action
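For intuition, here is a minimal standalone sketch of the same ε-greedy rule (a hypothetical helper for illustration only, not part of the original Agent class):
import numpy as np

def eps_greedy(q_row, pi_row, eps, rng=np.random):
    # with probability eps, explore by sampling from the random policy pi
    if rng.rand() < eps:
        return int(rng.choice(len(pi_row), p=pi_row))
    # otherwise exploit: pick the best known action, ignoring the NaN entries of blocked actions
    return int(np.nanargmax(q_row))

# example for state S0, where only right (1) and down (2) are allowed
q_row = np.array([np.nan, 0.2, 0.7, np.nan])
pi_row = np.array([0.0, 0.5, 0.5, 0.0])
print(eps_greedy(q_row, pi_row, eps=0.5))  # 1 or 2 when exploring, 2 when exploiting
In the full code below, agent.eps is halved after every epoch (agent.eps /= 2), so behaviour gradually shifts from exploration to exploitation.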
Full code
import numpy as np
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
import time
import os
plots_dir = 'plots\\MG4\\'
if not os.path.exists(plots_dir):
os.makedirs(plots_dir)
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()
# maintains the state and defines what step() returns
class MazeEnv(gym.Env):
def __init__(self):
self.state = 0
pass
def reset(self):
self.state = 0
return self.state
def step(self, action):
if action == 0:
self.state -= 3
elif action == 1:
self.state += 1
elif action == 2:
self.state += 3
elif action == 3:
self.state -= 1
done = False
reward = 0
if self.state == 8:
done = True
reward = 1
# state, reward, done, _
return self.state, reward, done, {}
# action selection policy, based on the current state of the environment
class Agent:
def __init__(self):
self.action_space = list(range(4))
self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
self.pi = self._cvt_theta_to_pi()
self.Q = np.random.rand(*self.theta_0.shape) * self.theta_0
        self.eta = 0.1  # learning rate
        self.gamma = 0.9  # discount factor for future rewards
        self.eps = 0.5  # exploration rate for ε-greedy action selection
def get_action(self, s):
# eps, explore
if np.random.rand() < self.eps:
action = np.random.choice(self.action_space, p=self.pi[s, :])
else:
# 1-eps, exploit
action = np.nanargmax(self.Q[s, :])
return action
def sarsa(self, s, a, r, s_next, a_next):
if s_next == 8:
self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
else:
self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * self.Q[s_next, a_next] - self.Q[s, a])
def _cvt_theta_to_pi(self):
m, n = self.theta_0.shape
pi = np.zeros((m, n))
for r in range(m):
pi[r, :] = self.theta_0[r, :] / np.nansum(self.theta_0[r, :])
return np.nan_to_num(pi)
    # probability of each selectable action in each state
maze = MazeEnv()
agent = Agent()
epoch = 0
while True:  # outer loop: how many epochs to run
old_Q = np.nanmax(agent.Q, axis=1)
s = maze.reset()
a = agent.get_action(s)
s_a_history = [[s, np.nan]]
    while True:  # inner loop: one episode from START to GOAL
# s, a
s_a_history[-1][1] = a
s_next, reward, done, _ = maze.step(a, )
# s_next, a_next
s_a_history.append([s_next, np.nan])
if done:
            a_next = np.nan  # the episode has ended: no next action is needed
else:
            a_next = agent.get_action(s_next)  # not finished yet: choose the next action
# print(s, a, reward, s_next, a_next)
agent.sarsa(s, a, reward, s_next, a_next)
# print(agent.pi)
if done:
break
else:
a = a_next
s = maze.state
    # s_a_history, agent.Q
update = np.sum(np.abs(np.nanmax(agent.Q, axis=1) - old_Q))
    # take the row-wise maximum of Q while ignoring NaN; the total change in these maxima decides convergence
epoch +=1
agent.eps /= 2
print(epoch, update, len(s_a_history))
if epoch > 100 or update < 1e-5:
break
print(agent.Q)
def init():
line.set_data([], [])
return (line,)
def animate(i):
state = s_a_history[i]
#print(state[0])
x = [(state[0] % 3) + 0.5]
y = [2.5 - int(state[0] / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save as a gif
Q-learning
The only difference from SARSA is that when Q-learning updates a Q value it uses the maximum Q value of the next state, regardless of how the next action is actually chosen.
SARSA is an on-policy algorithm: when it updates a Q value it uses the action that was actually selected.
Q-learning
def q_learning(self, s, a, r, s_next):
if s_next == 8:
self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
else:
self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * np.nanmax(self.Q[s_next, :]) - self.Q[s, a])
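In formula form, this update is
\[
Q(s, a) \leftarrow Q(s, a) + \eta \bigl( r + \gamma \max_{a'} Q(s_{\text{next}}, a') - Q(s, a) \bigr),
\]
again dropping the γ term when s_next is the goal. Taking the max over a' (np.nanmax over the row of Q) instead of the Q value of the action actually taken is what makes Q-learning off-policy.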
Full code
import numpy as np
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
import time
import os
plots_dir = 'plots\\MG5\\'
if not os.path.exists(plots_dir):
os.makedirs(plots_dir)
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
bottom=False, top=False,
right=False, left=False,
labelbottom=False, labelleft=False
)
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()
# maintains the state and defines what step() returns
class MazeEnv(gym.Env):
def __init__(self):
self.state = 0
pass
def reset(self):
self.state = 0
return self.state
def step(self, action):
if action == 0:
self.state -= 3
elif action == 1:
self.state += 1
elif action == 2:
self.state += 3
elif action == 3:
self.state -= 1
done = False
reward = 0
if self.state == 8:
done = True
reward = 1
# state, reward, done, _
return self.state, reward, done, {}
# action selection policy, based on the current state of the environment
class Agent:
def __init__(self):
self.action_space = list(range(4))
self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan], # s0
[np.nan, 1, np.nan, 1], # s1
[np.nan, np.nan, 1, 1], # s2
[1, np.nan, np.nan, np.nan], # s3
[np.nan, 1, 1, np.nan], # s4
[1, np.nan, np.nan, 1], # s5
[np.nan, 1, np.nan, np.nan], # s6
[1, 1, np.nan, 1]] # s7
)
self.pi = self._cvt_theta_to_pi()
# self.pi = self._softmax_cvt_theta_to_pi()
# self.theta = self.theta_0
self.Q = np.random.rand(*self.theta_0.shape) * self.theta_0
        self.eta = 0.1  # learning rate
        self.gamma = 0.9  # discount factor for future rewards
        self.eps = 0.5  # exploration rate for ε-greedy action selection
def get_action(self, s):
# eps, explore
if np.random.rand() < self.eps:
action = np.random.choice(self.action_space, p=self.pi[s, :])
else:
# 1-eps, exploit
action = np.nanargmax(self.Q[s, :])
return action
def q_learning(self, s, a, r, s_next):
if s_next == 8:
self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
else:
self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * np.nanmax(self.Q[s_next, :]) - self.Q[s, a])
    # the difference between Q-learning and SARSA: when updating a Q value, Q-learning uses the maximum Q value of the next state, regardless of which action is actually taken next
    # SARSA is on-policy: it updates with the Q value of the action that was actually selected
def _cvt_theta_to_pi(self):
m, n = self.theta_0.shape
pi = np.zeros((m, n))
for r in range(m):
pi[r, :] = self.theta_0[r, :] / np.nansum(self.theta_0[r, :])
return np.nan_to_num(pi)
    # probability of each selectable action in each state
maze = MazeEnv()
agent = Agent()
epoch = 0
while True:  # outer loop: how many epochs to run
old_Q = np.nanmax(agent.Q, axis=1)
s = maze.reset()
a = agent.get_action(s)
s_a_history = [[s, np.nan]]
    while True:  # inner loop: one episode from START to GOAL
# s, a
s_a_history[-1][1] = a
s_next, reward, done, _ = maze.step(a, )
# s_next, a_next
s_a_history.append([s_next, np.nan])
if done:
            a_next = np.nan  # the episode has ended: no next action is needed
else:
            a_next = agent.get_action(s_next)  # not finished yet: choose the next action
# print(s, a, reward, s_next, a_next)
        agent.q_learning(s, a, reward, s_next)  # unlike sarsa, q_learning does not need a_next; a_next is only used to continue the episode
# print(agent.pi)
if done:
break
else:
a = a_next
s = maze.state
    # s_a_history, agent.Q
update = np.sum(np.abs(np.nanmax(agent.Q, axis=1) - old_Q))
    # take the row-wise maximum of Q while ignoring NaN; the total change in these maxima decides convergence
epoch +=1
agent.eps /= 2
print(epoch, update, len(s_a_history))
if epoch > 100 or update < 1e-5:
break
print(agent.Q)
def init():
line.set_data([], [])
return (line,)
def animate(i):
state = s_a_history[i]
#print(state[0])
x = [(state[0] % 3) + 0.5]
y = [2.5 - int(state[0] / 3)]
line.set_data(x, y)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save as a gif