Reinforcement Learning: Hand-Writing a Maze Environment (maze env), States and a First Action Policy (plus saving a .gif)

Video link: https://www.bilibili.com/video/BV1Ye411P7Sw/?spm_id_from=333.1007.top_right_bar_window_history.content.click&vd_source=2613f45f05cdff9eb69129d6b00e37a2

Hand-writing the maze environment

Importing the libraries

plt is used for plotting
np is used for creating arrays
animation is used for building the animated visualization

import matplotlib.pyplot as plt  
import numpy as np
from matplotlib import animation
import time

Drawing the maze

figure() creates the figure
gca() returns the current Axes object
set_xlim() and set_ylim() set the x- and y-axis ranges
plot() draws a line segment specified as ([x1, x2], [y1, y2]), i.e. from (x1, y1) to (x2, y2)
text() places a text label in each cell
tick_params() customizes how the axis ticks and tick labels are displayed

fig = plt.figure(figsize=(5, 5))  # create a 5x5 inch figure
ax = plt.gca()  # get the current Axes object
ax.set_xlim(0, 3)  # set the x-axis range
ax.set_ylim(0, 3)  # set the y-axis range
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
# draw the walls: line segments from (2,1) to (3,1), (0,1) to (1,1), (1,1) to (1,2), and (1,2) to (2,2), in red with line width 2
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# label each cell
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
               )
# configure the axis ticks and tick labels: this hides all tick marks and labels on both axes, giving a cleaner plot
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
#draw a green circular marker of size 60 at (0.5, 2.5); line keeps a reference to this point so it can be updated later
#line, = ax.plot(...): ax.plot returns a list of Line2D objects, and the trailing comma unpacks its single element, so line is the Line2D object itself
plt.show()
# display the figure

Specifying the probability of each action

# border & barrier
theta_0 = np.asarray([[np.nan, 1, 1, np.nan],      # s0
                      [np.nan, 1, np.nan, 1],      # s1
                      [np.nan, np.nan, 1, 1],      # s2
                      [1, np.nan, np.nan, np.nan], # s3
                      [np.nan, 1, 1, np.nan],      # s4
                      [1, np.nan, np.nan, 1],      # s5
                      [np.nan, 1, np.nan, np.nan], # s6
                      [1, 1, np.nan, 1]]           # s7
                     )
# allowed actions per state; the columns are [up, right, down, left], and np.nan marks a move blocked by a wall or the border
def cvt_theta_0_to_pi(theta):
    # normalize each row so that every allowed action gets equal probability (this helper also appears in the complete code below)
    m, n = theta.shape
    pi = np.zeros((m, n))
    for r in range(m):
        pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
    return np.nan_to_num(pi)
pi = cvt_theta_0_to_pi(theta_0)
# probability of choosing each action in each state
print(pi)
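
As a quick sanity check, each row of pi spreads probability 1 evenly over that state's allowed moves, so print(pi) should show roughly the following (columns = [up, right, down, left], values rounded):

# s0: [0.    0.5   0.5   0.   ]
# s1: [0.    0.5   0.    0.5  ]
# s2: [0.    0.    0.5   0.5  ]
# s3: [1.    0.    0.    0.   ]
# s4: [0.    0.5   0.5   0.   ]
# s5: [0.5   0.    0.    0.5  ]
# s6: [0.    1.    0.    0.   ]
# s7: [0.333 0.333 0.    0.333]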

Track the current state, the action history, and the state history; keep sampling a random action and moving to the resulting state until the GOAL (S8) is reached

actions = list(range(4))
print(actions)
def step(state, action):
    if action == 0:      # up
        state -= 3
    elif action == 1:    # right
        state += 1
    elif action == 2:    # down
        state += 3
    elif action == 3:    # left
        state -= 1
    return state
# returns the state reached after taking the action
state = 0
action_history = []
state_history = [state]
while True:
    action = np.random.choice(actions, p=pi[state, :])
    state = step(state, action)
    if state == 8:
        state_history.append(8)
        break
    action_history.append(action)  # record the action
    state_history.append(state)    # record the state
print(len(action_history))
print(state_history)
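
Because the trajectory is sampled with np.random.choice, every run takes a different path. If you want a repeatable run for debugging, one option is to seed NumPy's global random number generator before the loop:

np.random.seed(0)  # optional: any fixed seed makes the sampled trajectory reproducible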

Visualization: generating a .gif

def init():
    line.set_data([], [])
    return (line, )
def animate(i):
    state = state_history[i]
    x = [(state % 3) + 0.5]     # column of the cell
    y = [2.5 - int(state / 3)]  # row of the cell, counted from the top
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
# build a function-based animation: animate updates the marker each frame, init sets the blank initial state, the number of frames equals len(state_history), frames are 200 ms apart, and the animation plays only once
name = 'MG' + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save the gif
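
Note: the 'pillow' writer requires the Pillow package. If an .mp4 is preferred over a .gif, matplotlib also provides an 'ffmpeg' writer, assuming the FFmpeg binary is available on the system.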

Complete code

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation
import time
fig = plt.figure(figsize=(5, 5))
# create a 5x5 inch figure
ax = plt.gca()
# get the current Axes object
ax.set_xlim(0, 3)
# set the x-axis range
ax.set_ylim(0, 3)
# set the y-axis range

plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
# draw the walls: line segments from (2,1) to (3,1), (0,1) to (1,1), (1,1) to (1,2), and (1,2) to (2,2), in red with line width 2
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# label each cell
# plt.axis('off')
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
               )
# configure the axis ticks and tick labels: this hides all tick marks and labels on both axes, giving a cleaner plot
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
#draw a green circular marker of size 60 at (0.5, 2.5); line keeps a reference to this point so it can be updated later
#line, = ax.plot(...): ax.plot returns a list of Line2D objects, and the trailing comma unpacks its single element, so line is the Line2D object itself
plt.show()
# display the figure

# border & barrier
theta_0 = np.asarray([[np.nan, 1, 1, np.nan],      # s0
                      [np.nan, 1, np.nan, 1],      # s1
                      [np.nan, np.nan, 1, 1],      # s2
                      [1, np.nan, np.nan, np.nan], # s3
                      [np.nan, 1, 1, np.nan],      # s4
                      [1, np.nan, np.nan, 1],      # s5
                      [np.nan, 1, np.nan, np.nan], # s6
                      [1, 1, np.nan, 1]]           # s7
                     )
# allowed actions per state; columns are [up, right, down, left], np.nan marks a blocked move
def cvt_theta_0_to_pi(theta):
    m, n = theta.shape
    pi = np.zeros((m, n))
    for r in range(m):
        pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
    return np.nan_to_num(pi)
pi = cvt_theta_0_to_pi(theta_0)
# probability of choosing each action in each state
print(pi)

actions = list(range(4))
print(actions)
def step(state, action):
    if action == 0:
        state -= 3
    elif action == 1:
        state += 1
    elif action == 2:
        state += 3
    elif action == 3:
        state -= 1
    return state
# returns the state reached after taking the action
state = 0
action_history = []
state_history = [state]
while True:
    action = np.random.choice(actions, p=pi[state, :])
    state = step(state, action)
    if state == 8:
        state_history.append(8)
        break
    action_history.append(action)  # record the action
    state_history.append(state)    # record the state
print(len(action_history))
print(state_history)

def init():
    line.set_data([], [])
    return (line, )
def animate(i):
    state = state_history[i]
    x = [(state % 3) + 0.5]
    y = [2.5 - int(state / 3)]
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
# build a function-based animation: animate updates the marker each frame, init sets the blank initial state, the number of frames equals len(state_history), frames are 200 ms apart, and the animation plays only once
name='MG'+str(time.mktime(time.localtime(time.time())))+'.gif'
anim.save(name, writer='pillow')  # save the gif

Encapsulating the environment and the agent: MazeEnv and Agent

The MazeEnv class

__init__() is required; it initializes the environment
step() executes an action, updates the environment, computes the reward, checks the termination condition, and returns the result of the interaction, i.e. it defines the state after each action
reset() resets the environment back to the start state

class MazeEnv(gym.Env):
    def __init__(self):
        self.state = 0
    def step(self, action):  # execute the action, update the environment, compute the reward, check for termination, and return the interaction result
        if action == 0:
            self.state -=3
        elif action == 1:
            self.state +=1
        elif action == 2:
            self.state +=3
        elif action == 3:
            self.state -=1
        done = False
        if self.state == 8:
            done = True
        return self.state, 1, done, False, {}  # (observation, reward, terminated, truncated, info)
    def reset(self):
        self.state = 0
        return self.state
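
A minimal interaction sketch (assuming gymnasium has been imported as gym, as in the complete code below, and MazeEnv is defined as above):

env = MazeEnv()
state = env.reset()                                   # back to S0
state, reward, done, truncated, info = env.step(2)    # action 2 moves down: S0 -> S3
print(state, reward, done)                            # 3 1 False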

The Agent class

__init__() initializes the agent
_cvt_theta_0_to_pi() converts the table of allowed actions into a probability distribution
choose_action() samples an action at random according to that distribution

class Agent:
    def __init__(self):
        self.actions=list(range(4))
        self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan],  # s0
                              [np.nan, 1, np.nan, 1],  # s1
                              [np.nan, np.nan, 1, 1],  # s2
                              [1, np.nan, np.nan, np.nan],  # s3
                              [np.nan, 1, 1, np.nan],  # s4
                              [1, np.nan, np.nan, 1],  # s5
                              [np.nan, 1, np.nan, np.nan],  # s6
                              [1, 1, np.nan, 1]]  # s7
                             )
        self.pi=self._cvt_theta_0_to_pi(self.theta_0)
    def _cvt_theta_0_to_pi(self,theta):
        m, n = theta.shape
        pi = np.zeros((m, n))
        for r in range(m):
            pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
        return np.nan_to_num(pi)
    def choose_action(self, state):
        action = np.random.choice(self.actions,p=self.pi[state,:])
        return action
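
A quick check of the policy table (a sketch; the sampled action will differ from run to run):

agent = Agent()
print(agent.pi[0])             # [0.  0.5 0.5 0. ]  (in S0 only right and down are allowed)
print(agent.choose_action(0))  # returns 1 or 2, each with probability 0.5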

Complete code

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import time

import os
plots_dir = 'plots\\MG2\\'
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)
# directory where the .gif files are stored
class MazeEnv(gym.Env):
    def __init__(self):
        self.state = 0
    def step(self, action):  # execute the action, update the environment, compute the reward, check for termination, and return the interaction result
        if action == 0:
            self.state -=3
        elif action == 1:
            self.state +=1
        elif action == 2:
            self.state +=3
        elif action == 3:
            self.state -=1
        done = False
        if self.state == 8:
            done = True
        return self.state, 1, done, False, {}
    def reset(self):
        self.state = 0
        return self.state
class Agent:
    def __init__(self):
        self.actions=list(range(4))
        self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan],  # s0
                              [np.nan, 1, np.nan, 1],  # s1
                              [np.nan, np.nan, 1, 1],  # s2
                              [1, np.nan, np.nan, np.nan],  # s3
                              [np.nan, 1, 1, np.nan],  # s4
                              [1, np.nan, np.nan, 1],  # s5
                              [np.nan, 1, np.nan, np.nan],  # s6
                              [1, 1, np.nan, 1]]  # s7
                             )
        self.pi=self._cvt_theta_0_to_pi(self.theta_0)
    def _cvt_theta_0_to_pi(self,theta):
        m, n = theta.shape
        pi = np.zeros((m, n))
        for r in range(m):
            pi[r, :] = theta[r, :] / np.nansum(theta[r, :])
        return np.nan_to_num(pi)
    def choose_action(self, state):
        action = np.random.choice(self.actions,p=self.pi[state,:])
        return action
env = MazeEnv()
state = env.reset()
agent = Agent()
done = False
action_history=[]
state_history=[state]
while  not done:
    action = agent.choose_action(state)
    state, reward, done,_, info = env.step(action)
    action_history.append(action)
    state_history.append(state)
print(action_history)
print(state_history)

fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)

plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)

plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
                )
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()

def init():
    line.set_data([], [])
    return (line,)

def animate(i):
    state = state_history[i]
    x = [(state % 3) + 0.5]
    y = [2.5 - int(state / 3)]
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(state_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save the gif

Updating the policy with softmax

The environment is reset at the start of every episode.
Within an episode: take the action chosen by the policy to obtain the next state and reward, and append the state-action pair to the history until the episode ends. After each episode, update the parameter matrix theta, and repeat until the policy converges.

A general, softmax-based way of converting theta into action probabilities

    # softmax-based, more general way of converting theta into action probabilities
    def _softmax_cvt_theta_to_pi(self, beta=1.0):
        m, n = self.theta.shape
        pi = np.zeros((m, n))
        exp_theta = np.exp(self.theta * beta)
        for r in range(m):
            pi[r, :] = exp_theta[r, :] / np.nansum(exp_theta[r, :])
        pi = np.nan_to_num(pi)
        return pi
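
With the initial theta_0, this softmax yields the same uniform distribution over the allowed moves as the simple normalization used earlier, because every allowed entry equals 1 (np.exp of NaN stays NaN, np.nansum ignores it, and np.nan_to_num zeroes it out afterwards). For example, for s0 with beta = 1.0:

# theta row for s0:  [nan, 1, 1, nan]
# exp(theta * beta): [nan, e, e, nan]
# nansum = 2e, so pi[0, :] = [0, 0.5, 0.5, 0] after np.nan_to_num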

The update rule implemented by update_theta below is delta_theta[i, j] = (N_ij - pi[i, j] * N_i) / T, followed by theta = theta + eta * delta_theta, where N_i is the number of visits to state i in the episode, N_ij the number of times action j was taken in state i, and T the episode length (len(s_a_history) - 1).

    def update_theta(self, s_a_history):
        T = len(s_a_history) - 1
        m, n = self.theta.shape
        delta_theta = self.theta.copy()
        for i in range(m):
            for j in range(n):
                if not (np.isnan(self.theta_0[i, j])):
                    sa_i = [sa for sa in s_a_history if sa[0] == i]
                    #select all state-action pairs in s_a_history whose state is i
                    sa_ij = [sa for sa in s_a_history if (sa[0] == i and sa[1] == j)]
                    #select all pairs in s_a_history whose state is i and whose action is j
                    N_i = len(sa_i)
                    N_ij = len(sa_ij)
                    delta_theta[i, j] = (N_ij - self.pi[i, j] * N_i) / T
        self.theta = self.theta + self.eta * delta_theta
        return self.theta
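
As a worked example with made-up numbers: suppose an episode of length T = 10 visits s0 four times (N_i = 4) and takes action 1 (right) in s0 three of those times (N_ij = 3), with the current pi[0, 1] = 0.5. Then delta_theta[0, 1] = (3 - 0.5 * 4) / 10 = 0.1, and with eta = 0.1 the entry theta[0, 1] grows by 0.01, so after the next softmax the action "right" in S0 becomes slightly more likely.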

Complete code

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import time

import os
plots_dir = 'plots\\MG3\\'
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)



class MazeEnv(gym.Env):
    def __init__(self):
        self.state = 0
    def step(self, action):
        if action == 0:
            self.state -=3
        elif action == 1:
            self.state +=1
        elif action == 2:
            self.state +=3
        elif action == 3:
            self.state -=1
        done = False
        if self.state == 8:
            done = True
        return self.state, 1 ,done,False, {}
    def reset(self):
        self.state = 0
        return self.state
class Agent:
    def __init__(self):
        self.actions=list(range(4))
        self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan],  # s0
                              [np.nan, 1, np.nan, 1],  # s1
                              [np.nan, np.nan, 1, 1],  # s2
                              [1, np.nan, np.nan, np.nan],  # s3
                              [np.nan, 1, 1, np.nan],  # s4
                              [1, np.nan, np.nan, 1],  # s5
                              [np.nan, 1, np.nan, np.nan],  # s6
                              [1, 1, np.nan, 1]]  # s7
                             )
        self.theta = self.theta_0
        self.pi = self._softmax_cvt_theta_to_pi()
        self.eta = 0.1

    # softmax-based, more general way of converting theta into action probabilities
    def _softmax_cvt_theta_to_pi(self, beta=1.0):
        m, n = self.theta.shape
        pi = np.zeros((m, n))
        exp_theta = np.exp(self.theta * beta)
        for r in range(m):
            pi[r, :] = exp_theta[r, :] / np.nansum(exp_theta[r, :])
        pi = np.nan_to_num(pi)
        return pi

    def update_theta(self, s_a_history):
        T = len(s_a_history) - 1
        m, n = self.theta.shape
        delta_theta = self.theta.copy()
        for i in range(m):
            for j in range(n):
                if not (np.isnan(self.theta_0[i, j])):
                    sa_i = [sa for sa in s_a_history if sa[0] == i]
                    sa_ij = [sa for sa in s_a_history if (sa[0] == i and sa[1] == j)]
                    N_i = len(sa_i)
                    N_ij = len(sa_ij)
                    delta_theta[i, j] = (N_ij - self.pi[i, j] * N_i) / T
        self.theta = self.theta + self.eta * delta_theta
        return self.theta

    def update_pi(self):
        self.pi = self._softmax_cvt_theta_to_pi()
        return self.pi

    def choose_action(self, state):
        action = np.random.choice(self.actions,p=self.pi[state,:])
        return action
stop_eps = 1e-4
agent = Agent()
env = MazeEnv()

while True:
    # start from the initial state and generate one trajectory
    state = env.reset()
    # state, action
    s_a_history = [[state, np.nan]]
    while True:
        action = agent.choose_action(state)
        s_a_history[-1][1] = action
        state, reward, done, _,_ = env.step(action)
        #     print(action, state, done)
        s_a_history.append([state, np.nan])
        if state == 8 or done:
            break
    # update theta
    agent.update_theta(s_a_history)
    pi = agent.pi.copy()
    # update pi
    agent.update_pi()
    delta = np.sum(np.abs(agent.pi - pi))
    print(len(s_a_history), delta)
    if delta < stop_eps:
        break
print(agent.pi)
print(s_a_history)

fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)

plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)

plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
                )
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()

def init():
    line.set_data([], [])
    return (line,)

def animate(i):
    state = s_a_history[i]
    #print(state[0])
    x = [(state[0] % 3) + 0.5]
    y = [2.5 - int(state[0] / 3)]
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save the gif

The SARSA algorithm

First decide how many episodes (epochs) to run; the environment is reset at the start of every episode.
Before each episode, take the per-row maximum of the agent.Q table (the largest Q value of each state over its actions) and store it in old_Q.
Within an episode: take the chosen action to obtain the next state and reward, append the state-action pair to the history, and update the Q value, until the episode ends. After the episode, compute the per-row maximum again while ignoring NaN entries; the total change relative to old_Q decides whether training has converged.

The SARSA update

If s_next is the terminal state, the Q value is updated with the immediate reward r only: Q(s, a) <- Q(s, a) + eta * (r - Q(s, a)). Otherwise it is updated with the immediate reward plus the future return estimated through the discount factor gamma and the Q value of the next state-action pair: Q(s, a) <- Q(s, a) + eta * (r + gamma * Q(s_next, a_next) - Q(s, a)).

    def sarsa(self, s, a, r, s_next, a_next):
        if s_next == 8:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
        else:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * self.Q[s_next, a_next] - self.Q[s, a])
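
A worked update with made-up numbers: take eta = 0.1, gamma = 0.9, r = 0, Q(s, a) = 0.2 and Q(s_next, a_next) = 0.5 for a non-terminal transition. Then Q(s, a) becomes 0.2 + 0.1 * (0 + 0.9 * 0.5 - 0.2) = 0.2 + 0.1 * 0.25 = 0.225.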

Action selection: the ε-greedy policy

With probability eps the agent explores, sampling a random action from the initial policy pi; with probability 1 - eps it exploits, taking the action with the largest Q value in the current state (np.nanargmax skips the NaN entries of blocked moves). In the training loop below, eps is halved after every epoch, so the agent exploits more and more as learning progresses.

    def get_action(self, s):
        # eps, explore
        if np.random.rand() < self.eps:
            action = np.random.choice(self.action_space, p=self.pi[s, :])
        else:
            # 1-eps, exploit
            action = np.nanargmax(self.Q[s, :])
        return action

Complete code

import numpy as np
import gym
import matplotlib.pyplot as plt

from matplotlib import animation
import time

import os
plots_dir = 'plots\\MG4\\'
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)

fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)

plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)

plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
                )
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()


# maintains the state and defines what the step function returns
class MazeEnv(gym.Env):
    def __init__(self):
        self.state = 0
        pass

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        if action == 0:
            self.state -= 3
        elif action == 1:
            self.state += 1
        elif action == 2:
            self.state += 3
        elif action == 3:
            self.state -= 1
        done = False
        reward = 0
        if self.state == 8:
            done = True
            reward = 1
        # state, reward, done, _
        return self.state, reward, done, {}


# chooses actions based on the current state of the environment
class Agent:
    def __init__(self):
        self.action_space = list(range(4))
        self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan],  # s0
                                   [np.nan, 1, np.nan, 1],  # s1
                                   [np.nan, np.nan, 1, 1],  # s2
                                   [1, np.nan, np.nan, np.nan],  # s3
                                   [np.nan, 1, 1, np.nan],  # s4
                                   [1, np.nan, np.nan, 1],  # s5
                                   [np.nan, 1, np.nan, np.nan],  # s6
                                   [1, 1, np.nan, 1]]  # s7
                                  )
        self.pi = self._cvt_theta_to_pi()

        self.Q = np.random.rand(*self.theta_0.shape) * self.theta_0
        self.eta = 0.1    # learning rate
        self.gamma = 0.9  # discount factor
        self.eps = 0.5    # epsilon of the epsilon-greedy action selection

    def get_action(self, s):
        # eps, explore
        if np.random.rand() < self.eps:
            action = np.random.choice(self.action_space, p=self.pi[s, :])
        else:
            # 1-eps, exploit
            action = np.nanargmax(self.Q[s, :])
        return action

    def sarsa(self, s, a, r, s_next, a_next):
        if s_next == 8:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
        else:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * self.Q[s_next, a_next] - self.Q[s, a])

    def _cvt_theta_to_pi(self):
        m, n = self.theta_0.shape
        pi = np.zeros((m, n))
        for r in range(m):
            pi[r, :] = self.theta_0[r, :] / np.nansum(self.theta_0[r, :])
        return np.nan_to_num(pi)
    # probabilities over the actions available in each state

maze = MazeEnv()
agent = Agent()
epoch = 0

while True:  # outer loop: one iteration per training epoch
    old_Q = np.nanmax(agent.Q, axis=1)
    s = maze.reset()
    a = agent.get_action(s)
    s_a_history = [[s, np.nan]]
    while True:  # inner loop: one episode from start to goal
        # s, a
        s_a_history[-1][1] = a
        s_next, reward, done, _ = maze.step(a)
        # s_next, a_next
        s_a_history.append([s_next, np.nan])
        if done:
            a_next = np.nan  # episode finished; no next action is needed
        else:
            a_next = agent.get_action(s_next)  # episode not finished; choose the next action
#         print(s, a, reward, s_next, a_next)
        agent.sarsa(s, a, reward, s_next, a_next)
#         print(agent.pi)
        if done:
            break
        else:
            a = a_next
            s = maze.state

    # s_a_history, agent.Q
    update = np.sum(np.abs(np.nanmax(agent.Q, axis=1) - old_Q))
    # per-row max of Q (ignoring NaN) compared with old_Q; the total absolute change measures convergence
    epoch += 1
    agent.eps /= 2  # halve epsilon each epoch so the agent exploits more over time
    print(epoch, update, len(s_a_history))
    if epoch > 100 or update < 1e-5:
        break
print(agent.Q)
def init():
    line.set_data([], [])
    return (line,)

def animate(i):
    state = s_a_history[i]
    #print(state[0])
    x = [(state[0] % 3) + 0.5]
    y = [2.5 - int(state[0] / 3)]
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save the gif

Q-learning

The only difference from SARSA is that Q-learning updates a Q value using the maximum Q value over the actions of the next state, without caring which action is actually chosen next.
SARSA is an on-policy algorithm: when updating a Q value it uses the action that is actually taken next.
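
The difference is easiest to see by writing the two bootstrap targets side by side (a sketch, with Q, r, gamma, s_next and a_next as in the code below):

# SARSA (on-policy): bootstraps from the action the agent will actually take next
target_sarsa = r + gamma * Q[s_next, a_next]
# Q-learning (off-policy): bootstraps from the greedy action in s_next, whatever is taken next
target_q_learning = r + gamma * np.nanmax(Q[s_next, :])
# in both cases the update is Q[s, a] += eta * (target - Q[s, a])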

Q-learning

    def q_learning(self, s, a, r, s_next):
        if s_next == 8:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
        else:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * np.nanmax(self.Q[s_next, :]) - self.Q[s, a])

Complete code

import numpy as np
import gym
import matplotlib.pyplot as plt

from matplotlib import animation
import time

import os
plots_dir = 'plots\\MG5\\'
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)

fig = plt.figure(figsize=(5, 5))
ax = plt.gca()
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)

plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([0, 1], [1, 1], color='red', linewidth=2)
plt.plot([1, 1], [1, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)

plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')
# plt.axis('off')
plt.tick_params(axis='both', which='both',
                bottom=False, top=False,
                right=False, left=False,
                labelbottom=False, labelleft=False
                )
line, = ax.plot([0.5], [2.5], marker='o', color='g', markersize=60)
plt.show()


# maintains the state and defines what the step function returns
class MazeEnv(gym.Env):
    def __init__(self):
        self.state = 0
        pass

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        if action == 0:
            self.state -= 3
        elif action == 1:
            self.state += 1
        elif action == 2:
            self.state += 3
        elif action == 3:
            self.state -= 1
        done = False
        reward = 0
        if self.state == 8:
            done = True
            reward = 1
        # state, reward, done, _
        return self.state, reward, done, {}


# chooses actions based on the current state of the environment
class Agent:
    def __init__(self):
        self.action_space = list(range(4))
        self.theta_0 = np.asarray([[np.nan, 1, 1, np.nan],  # s0
                                   [np.nan, 1, np.nan, 1],  # s1
                                   [np.nan, np.nan, 1, 1],  # s2
                                   [1, np.nan, np.nan, np.nan],  # s3
                                   [np.nan, 1, 1, np.nan],  # s4
                                   [1, np.nan, np.nan, 1],  # s5
                                   [np.nan, 1, np.nan, np.nan],  # s6
                                   [1, 1, np.nan, 1]]  # s7
                                  )
        self.pi = self._cvt_theta_to_pi()
        #         self.pi = self._softmax_cvt_theta_to_pi()
        #         self.theta = self.theta_0

        self.Q = np.random.rand(*self.theta_0.shape) * self.theta_0
        self.eta = 0.1    # learning rate
        self.gamma = 0.9  # discount factor
        self.eps = 0.5    # epsilon of the epsilon-greedy action selection

    def get_action(self, s):
        # eps, explore
        if np.random.rand() < self.eps:
            action = np.random.choice(self.action_space, p=self.pi[s, :])
        else:
            # 1-eps, exploit
            action = np.nanargmax(self.Q[s, :])
        return action

    def q_learning(self, s, a, r, s_next):
        if s_next == 8:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r - self.Q[s, a])
        else:
            self.Q[s, a] = self.Q[s, a] + self.eta * (r + self.gamma * np.nanmax(self.Q[s_next, :]) - self.Q[s, a])
    # Unlike SARSA, Q-learning updates Q using the largest Q value over the actions of the next state,
    # regardless of which action is actually chosen next.
    # SARSA is on-policy: it updates Q using the action that is actually taken next.
    def _cvt_theta_to_pi(self):
        m, n = self.theta_0.shape
        pi = np.zeros((m, n))
        for r in range(m):
            pi[r, :] = self.theta_0[r, :] / np.nansum(self.theta_0[r, :])
        return np.nan_to_num(pi)
    # probabilities over the actions available in each state

maze = MazeEnv()
agent = Agent()
epoch = 0
while True:  # outer loop: one iteration per training epoch
    old_Q = np.nanmax(agent.Q, axis=1)
    s = maze.reset()
    a = agent.get_action(s)
    s_a_history = [[s, np.nan]]
    while True:  # inner loop: one episode from start to goal
        # s, a
        s_a_history[-1][1] = a
        s_next, reward, done, _ = maze.step(a)
        # s_next, a_next
        s_a_history.append([s_next, np.nan])
        if done:
            a_next = np.nan  # episode finished; no next action is needed
        else:
            a_next = agent.get_action(s_next)  # episode not finished; choose the next action
#         print(s, a, reward, s_next, a_next)
        agent.q_learning(s, a, reward, s_next)  # note: unlike sarsa, q_learning does not take a_next
#         print(agent.pi)
        if done:
            break
        else:
            a = a_next
            s = maze.state

    # s_a_history, agent.Q
    update = np.sum(np.abs(np.nanmax(agent.Q, axis=1) - old_Q))
    # per-row max of Q (ignoring NaN) compared with old_Q; the total absolute change measures convergence
    epoch += 1
    agent.eps /= 2  # halve epsilon each epoch so the agent exploits more over time
    print(epoch, update, len(s_a_history))
    if epoch > 100 or update < 1e-5:
        break
print(agent.Q)
def init():
    line.set_data([], [])
    return (line,)

def animate(i):
    state = s_a_history[i]
    #print(state[0])
    x = [(state[0] % 3) + 0.5]
    y = [2.5 - int(state[0] / 3)]
    line.set_data(x, y)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=200, repeat=False)
name = plots_dir + str(time.mktime(time.localtime(time.time()))) + '.gif'
anim.save(name, writer='pillow')  # save the gif