This code implements a reinforcement learning environment for transportation route optimization together with a Q-learning agent. Below is a section-by-section walkthrough of the code:
Importing the required libraries
python
import numpy as np
import gym
from gym import spaces
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import random
- NumPy for numerical computation
- OpenAI Gym (and its spaces module) for building the reinforcement learning environment
- Matplotlib, with LinearSegmentedColormap, for visualization
- random for random number generation
Note that the code uses the legacy Gym API (reset() returns only the observation and step() returns a 4-tuple), so it expects an older gym release rather than gymnasium.
Configuring a Chinese font
python
plt.rcParams['font.sans-serif'] = ['SimSun'] # 设置宋体
plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
- Configures Matplotlib to render Chinese text with the SimSun font
- Ensures the minus sign is displayed correctly
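SimSun ships with Windows but is usually missing on macOS and Linux, where the Chinese plot titles used later would render as boxes. A minimal cross-platform sketch (the font names below are assumptions about what is installed locally) lists several common CJK fonts and lets Matplotlib use the first one it finds:
python
import matplotlib.pyplot as plt

# Matplotlib walks this list and picks the first installed font.
# Which of these fonts exist is machine-dependent (an assumption, not a guarantee).
plt.rcParams['font.sans-serif'] = [
    'SimSun',            # Windows
    'Microsoft YaHei',   # Windows
    'PingFang SC',       # macOS
    'Noto Sans CJK SC',  # common on Linux
]
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly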
Defining the transportation environment class
python
class TransportationEnv(gym.Env):
"""运输路径优化环境"""
metadata = {'render.modes': ['human']}
- Defines a custom environment by subclassing gym.Env
- Declares 'human' as the only supported render mode
Initializing the environment parameters
python
def __init__(self, grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3):
super(TransportationEnv, self).__init__()
# 环境参数
self.grid_size = grid_size
self.num_obstacles = num_obstacles
self.congestion_factor = congestion_factor
# 动作空间: 上、下、左、右
self.action_space = spaces.Discrete(4)
# 状态空间: 智能体位置(x,y) + 终点位置(x,y) + 拥堵信息(简化为距离终点的曼哈顿距离)
low = np.array([0, 0, 0, 0, 0])
high = np.array([grid_size[0]-1, grid_size[1]-1,
grid_size[0]-1, grid_size[1]-1,
grid_size[0]+grid_size[1]])
self.observation_space = spaces.Box(low, high, dtype=np.int32)
# 初始化环境
self.reset()
- Stores the grid size, the number of obstacles, and the congestion factor
- Defines the action space as 4 discrete actions (up, down, left, right)
- Defines the observation space as a 5-element Box holding the agent position, the goal position, and the Manhattan distance to the goal
- Calls reset() so the environment is fully initialized on construction
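As a quick sanity check (a minimal sketch, assuming the TransportationEnv class above is defined in the same module), you can instantiate the environment and inspect its spaces:
python
env = TransportationEnv(grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3)
print(env.action_space)             # Discrete(4)
print(env.observation_space.shape)  # (5,)
print(env.observation_space.low)    # [0 0 0 0 0]
print(env.observation_space.high)   # [ 9  9  9  9 20]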
Resetting the environment
python
def reset(self):
"""重置环境"""
# 随机生成障碍物
self.obstacles = set()
while len(self.obstacles) < self.num_obstacles:
x = random.randint(0, self.grid_size[0]-1)
y = random.randint(0, self.grid_size[1]-1)
self.obstacles.add((x, y))
# 随机生成起点和终点,确保不在障碍物上
while True:
self.agent_pos = (random.randint(0, self.grid_size[0]-1),
random.randint(0, self.grid_size[1]-1))
if self.agent_pos not in self.obstacles:
break
while True:
self.goal_pos = (random.randint(0, self.grid_size[0]-1),
random.randint(0, self.grid_size[1]-1))
if self.goal_pos != self.agent_pos and self.goal_pos not in self.obstacles:
break
# 初始化路径和步数
self.path = [self.agent_pos]
self.steps = 0
self.max_steps = self.grid_size[0] * self.grid_size[1] * 2
# 生成拥堵区域
self.generate_congestion()
# 返回初始状态
return self.get_state()
- Randomly places the obstacles
- Randomly places the agent's start and the goal, making sure neither lies on an obstacle and that they are distinct
- Initializes the traveled path and the step counter
- Sets the maximum number of steps to twice the number of grid cells
- Generates the congestion areas
- Returns the initial state
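A short usage sketch (assuming the class above): reset the environment and inspect the initial state vector it returns:
python
env = TransportationEnv()
state = env.reset()
print(state)                        # [agent_x, agent_y, goal_x, goal_y, distance_to_goal]
print(env.agent_pos, env.goal_pos)  # randomly chosen, never on an obstacle
print(len(env.obstacles), env.max_steps)  # 15 200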
Generating congestion areas
python
def generate_congestion(self):
"""生成拥堵区域"""
self.congestion = set()
num_congestion = int(self.grid_size[0] * self.grid_size[1] * self.congestion_factor)
# 拥堵区域倾向于靠近中心
center_x, center_y = self.grid_size[0]//2, self.grid_size[1]//2
for _ in range(num_congestion):
while True:
# 以中心为基准的高斯分布生成坐标
x = int(np.clip(np.random.normal(center_x, self.grid_size[0]/4), 0, self.grid_size[0]-1))
y = int(np.clip(np.random.normal(center_y, self.grid_size[1]/4), 0, self.grid_size[1]-1))
pos = (x, y)
if pos not in self.obstacles and pos not in self.congestion:
self.congestion.add(pos)
break
- Computes the number of congested cells from the congestion factor
- Samples congested-cell coordinates from a Gaussian distribution centered on the grid, so congestion clusters around the middle
- Ensures congested cells overlap neither obstacles nor previously generated congested cells
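A small check (again assuming the environment class above) that the generated congestion matches this description:
python
env = TransportationEnv(grid_size=(10, 10), congestion_factor=0.3)
print(len(env.congestion))             # 30 cells: int(10 * 10 * 0.3)
print(env.congestion & env.obstacles)  # set(): congestion never overlaps an obstacle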
Executing an action and returning the result
python
def step(self, action):
"""执行动作并返回下一状态、奖励和终止标志"""
self.steps += 1
# 计算新位置
x, y = self.agent_pos
if action == 0: # 上
y = max(y-1, 0)
elif action == 1: # 下
y = min(y+1, self.grid_size[1]-1)
elif action == 2: # 左
x = max(x-1, 0)
elif action == 3: # 右
x = min(x+1, self.grid_size[0]-1)
new_pos = (x, y)
# 检查是否撞到障碍物
if new_pos in self.obstacles:
reward = -10 # 撞到障碍物的惩罚
done = False # 可以继续尝试
else:
self.agent_pos = new_pos
self.path.append(new_pos)
# 计算奖励
prev_dist = self.manhattan_distance(self.path[-2], self.goal_pos)
curr_dist = self.manhattan_distance(self.agent_pos, self.goal_pos)
if new_pos == self.goal_pos:
reward = 100 # 到达目标的奖励
done = True
elif self.steps >= self.max_steps:
reward = -50 # 超时惩罚
done = True
else:
# 距离减少的奖励,进入拥堵区域的惩罚
reward = (prev_dist - curr_dist) * 2
if new_pos in self.congestion:
reward -= 3 # 拥堵区域惩罚
done = False
# 返回下一状态、奖励、终止标志和额外信息
return self.get_state(), reward, done, {}
- Computes the new position for the chosen action, clamping it to the grid boundaries
- Hitting an obstacle gives a reward of -10; the agent stays in place and the episode continues (note that the timeout check below is skipped on these steps)
- Reaching the goal gives a reward of +100 and ends the episode
- Exceeding the maximum step count gives a reward of -50 and ends the episode
- Otherwise the reward is twice the decrease in Manhattan distance to the goal
- Entering a congested cell costs an extra -3
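A single interaction step, sketched under the assumption that the environment class above is in scope:
python
env = TransportationEnv()
state = env.reset()
action = env.action_space.sample()   # random action: 0=up, 1=down, 2=left, 3=right
next_state, reward, done, info = env.step(action)
print(action, reward, done)
print(next_state)                    # same layout as the state returned by reset()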
Getting the current state and computing the Manhattan distance
python
def get_state(self):
"""获取当前状态"""
dist_to_goal = self.manhattan_distance(self.agent_pos, self.goal_pos)
return np.array([self.agent_pos[0], self.agent_pos[1],
self.goal_pos[0], self.goal_pos[1],
dist_to_goal])
def manhattan_distance(self, pos1, pos2):
"""计算曼哈顿距离"""
return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
- get_state: returns a state vector containing the agent position, the goal position, and the distance to the goal
- manhattan_distance: computes the Manhattan distance between two points (the sum of the horizontal and vertical offsets)
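For example, the Manhattan distance between (2, 3) and (5, 7) is |2 - 5| + |3 - 7| = 3 + 4 = 7; a one-line check (assuming the class above):
python
env = TransportationEnv()
print(env.manhattan_distance((2, 3), (5, 7)))  # 7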
Rendering the environment
python
def render(self, mode='human'):
"""渲染环境"""
if mode == 'human':
# 创建网格
grid = np.zeros(self.grid_size)
# 标记障碍物
for x, y in self.obstacles:
grid[y, x] = -1 # 障碍物用-1表示
# 标记拥堵区域
for x, y in self.congestion:
if grid[y, x] != -1: # 不覆盖障碍物
grid[y, x] = 0.5 # 拥堵区域用0.5表示
# 标记路径
for x, y in self.path:
if grid[y, x] not in [-1, 0.5]: # 不覆盖障碍物和拥堵
grid[y, x] = 0.2 # 路径用0.2表示
# 标记当前位置和目标位置
grid[self.agent_pos[1], self.agent_pos[0]] = 1 # 当前位置用1表示
grid[self.goal_pos[1], self.goal_pos[0]] = 2 # 目标位置用2表示
# 创建自定义颜色映射
colors = [(0, 0, 0), (0.5, 0.5, 0.5), (0.7, 0.7, 0.7), (0, 1, 0), (1, 0, 0)] # 黑, 灰, 浅灰, 绿, 红
cmap_name = 'custom_cmap'
cmap = LinearSegmentedColormap.from_list(cmap_name, colors, N=5)
# 显示网格
plt.figure(figsize=(10, 8))
plt.imshow(grid, cmap=cmap, interpolation='nearest')
plt.colorbar(ticks=[-1, 0, 0.5, 1, 2],
label='-1:障碍物, 0:空, 0.5:拥堵, 1:当前位置, 2:目标')
plt.title('运输路径优化环境')
plt.grid(True, which='both', linestyle='-', color='gray', alpha=0.5)
plt.show()
else:
super(TransportationEnv, self).render(mode=mode)
- Builds a 2D array representing the grid (indexed as grid[y, x])
- Encodes obstacles, congestion, the traveled path, the current position, and the goal as distinct numeric values
- Builds a custom colormap that maps those values to different colors
- Displays the result with Matplotlib
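Rendering a freshly reset environment after a few random moves (a minimal sketch assuming the class above; it opens a Matplotlib window):
python
env = TransportationEnv()
env.reset()
for _ in range(5):              # a few random steps so the path is visible
    _, _, done, _ = env.step(env.action_space.sample())
    if done:
        break
env.render()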
The Q-learning agent class
python
class QLearningAgent:
"""Q学习智能体"""
def __init__(self, state_size, action_size, learning_rate=0.1,
discount_factor=0.9, exploration_rate=1.0,
exploration_decay=0.995, min_exploration_rate=0.01):
self.state_size = state_size
self.action_size = action_size
self.learning_rate = learning_rate
self.discount_factor = discount_factor
self.exploration_rate = exploration_rate
self.exploration_decay = exploration_decay
self.min_exploration_rate = min_exploration_rate
# 离散化状态空间以适应Q表
self.state_bins = self._create_state_bins()
# 初始化Q表
self.q_table = {}
- Stores the Q-learning hyperparameters: learning rate, discount factor, exploration rate, exploration decay, and minimum exploration rate
- Creates state bins intended for discretizing the state space (note that _discretize_state below does not actually use them)
- Initializes the Q-table as an empty dictionary
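Constructing an agent for this environment (a sketch using the defaults shown above, and assuming both classes are defined):
python
env = TransportationEnv(grid_size=(10, 10))
agent = QLearningAgent(state_size=env.observation_space.shape[0],  # 5
                       action_size=env.action_space.n)             # 4
print(agent.exploration_rate, len(agent.q_table))                  # 1.0 0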
State discretization methods
python
def _create_state_bins(self):
"""创建状态离散化的分箱"""
bins = []
# 位置和目标坐标的分箱
for _ in range(4):
bins.append(np.linspace(0, 9, 10)) # 假设网格大小为10x10
# 距离的分箱
bins.append(np.linspace(0, 18, 10)) # 最大曼哈顿距离为18
return bins
def _discretize_state(self, state):
"""将连续状态离散化"""
discrete_state = []
for i, s in enumerate(state):
if i < 4: # 位置和目标坐标
discrete_state.append(int(s))
else: # 距离
discrete_state.append(min(int(s / 2), 9)) # 将距离分箱为0-9
return tuple(discrete_state)
- _create_state_bins: builds bins for discretizing the state; the ranges are hard-coded for a 10x10 grid
- _discretize_state: converts a state vector into a tuple usable as a Q-table key; the four coordinates are cast to int and the distance is bucketed into 0-9 by dividing by 2 (self.state_bins is not consulted here)
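A quick illustration (assuming the QLearningAgent class above): for the state [2, 3, 9, 8, 12] the distance 12 lands in bucket min(12 // 2, 9) = 6:
python
import numpy as np

agent = QLearningAgent(state_size=5, action_size=4)
state = np.array([2, 3, 9, 8, 12])     # agent at (2,3), goal at (9,8), Manhattan distance 12
print(agent._discretize_state(state))  # (2, 3, 9, 8, 6)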
Action selection and Q-table updates
python
def act(self, state):
"""选择动作"""
discrete_state = self._discretize_state(state)
# 探索或利用
if np.random.uniform(0, 1) < self.exploration_rate:
return np.random.choice(self.action_size) # 随机探索
# 利用已知最优动作
if discrete_state not in self.q_table:
return np.random.choice(self.action_size)
q_values = self.q_table[discrete_state]
return np.argmax(q_values)
def update(self, state, action, reward, next_state, done):
"""更新Q表"""
discrete_state = self._discretize_state(state)
discrete_next_state = self._discretize_state(next_state)
# 初始化Q表项(如果不存在)
if discrete_state not in self.q_table:
self.q_table[discrete_state] = np.zeros(self.action_size)
if discrete_next_state not in self.q_table:
self.q_table[discrete_next_state] = np.zeros(self.action_size)
# Q学习更新公式
current_q = self.q_table[discrete_state][action]
max_next_q = np.max(self.q_table[discrete_next_state])
if done:
new_q = reward
else:
new_q = current_q + self.learning_rate * (reward + self.discount_factor * max_next_q - current_q)
self.q_table[discrete_state][action] = new_q
# 衰减探索率
if self.exploration_rate > self.min_exploration_rate:
self.exploration_rate *= self.exploration_decay
- act: selects an action with an ε-greedy policy, balancing exploration and exploitation; states not yet in the Q-table also fall back to a random action
- update: applies the Q-learning update (Bellman equation) to the Q-table; for terminal transitions the target is the reward alone
- The exploration rate is decayed after every update until it reaches the minimum
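A single hand-worked update (a sketch assuming the agent class above): with alpha = 0.1, gamma = 0.9, a current Q-value of 0, a reward of 2, and a maximum next-state Q-value of 0, the new value is 0 + 0.1 * (2 + 0.9 * 0 - 0) = 0.2:
python
import numpy as np

agent = QLearningAgent(state_size=5, action_size=4,
                       learning_rate=0.1, discount_factor=0.9)
s      = np.array([2, 3, 9, 8, 12])  # current state
s_next = np.array([3, 3, 9, 8, 11])  # one step closer to the goal after moving right
agent.update(s, action=3, reward=2, next_state=s_next, done=False)
print(agent.q_table[agent._discretize_state(s)])  # [0.  0.  0.  0.2]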
Training and evaluation functions
python
def train_agent(env, agent, num_episodes=1000):
"""训练智能体"""
rewards_history = []
for episode in range(num_episodes):
state = env.reset()
total_reward = 0
done = False
while not done:
action = agent.act(state)
next_state, reward, done, _ = env.step(action)
agent.update(state, action, reward, next_state, done)
state = next_state
total_reward += reward
rewards_history.append(total_reward)
if (episode + 1) % 100 == 0:
print(f"Episode {episode+1}/{num_episodes}, Average Reward: {np.mean(rewards_history[-100:]):.2f}")
return rewards_history
def evaluate_agent(env, agent, num_episodes=10):
"""评估智能体性能"""
success_count = 0
total_steps = []
for episode in range(num_episodes):
state = env.reset()
done = False
steps = 0
while not done:
action = agent.act(state) # 利用模式,不探索
state, _, done, _ = env.step(action)
steps += 1
if env.agent_pos == env.goal_pos:
success_count += 1
total_steps.append(steps)
print(f"Evaluation Episode {episode+1}: {'Success' if done and env.agent_pos == env.goal_pos else 'Failure'}")
success_rate = success_count / num_episodes
avg_steps = np.mean(total_steps) if total_steps else float('inf')
print(f"Success Rate: {success_rate:.2%}, Average Steps: {avg_steps:.2f}")
return success_rate, avg_steps
- train_agent: trains the agent, records the total reward of every episode, and prints a 100-episode average every 100 episodes
- evaluate_agent: measures the success rate and the average number of steps over a few episodes; note that agent.act() still follows the decayed ε-greedy policy, so a small amount of exploration remains during evaluation
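A shorter end-to-end run with a smoothed reward curve (a sketch assuming all of the definitions above; 200 episodes keeps it quick):
python
import numpy as np
import matplotlib.pyplot as plt

env = TransportationEnv(grid_size=(10, 10))
agent = QLearningAgent(env.observation_space.shape[0], env.action_space.n)
rewards = train_agent(env, agent, num_episodes=200)
evaluate_agent(env, agent, num_episodes=5)

# Smooth the noisy per-episode rewards with a 20-episode moving average.
window = 20
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel(f'Mean reward ({window}-episode window)')
plt.show()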
The main block
python
if __name__ == "__main__":
# 创建环境和智能体
env = TransportationEnv(grid_size=(10, 10))
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = QLearningAgent(state_size, action_size)
# 训练智能体
print("开始训练智能体...")
rewards = train_agent(env, agent, num_episodes=500)
# 评估智能体
print("\n评估智能体性能...")
evaluate_agent(env, agent, num_episodes=5)
# 可视化训练过程
plt.figure(figsize=(10, 6))
plt.plot(rewards)
plt.title('训练奖励')
plt.xlabel('回合')
plt.ylabel('总奖励')
plt.grid(True)
plt.show()
# 可视化最优路径
print("\n可视化最优路径...")
state = env.reset()
done = False
while not done:
action = agent.act(state) # 利用模式
state, _, done, _ = env.step(action)
env.render()
- Creates the environment and the agent
- Trains the agent and prints the training progress
- Evaluates the agent's performance
- Plots the total reward per training episode
- Runs one more episode with the (mostly greedy) learned policy and renders the resulting path
Summary
This code implements a reinforcement-learning-based transportation route optimization system, consisting of:
- A custom Gym environment simulating a grid world with obstacles and congestion areas
- A Q-learning agent that handles the state space by discretizing it into Q-table keys
- Training and evaluation functions for training the agent and measuring its performance
- Visualization utilities for the environment layout and the training curve
At its core is the Q-learning algorithm: by repeatedly interacting with the environment and learning from the rewards, the agent gradually learns a short route from the start to the goal while avoiding obstacles and congested areas.