This code implements a reinforcement learning environment for transportation route optimization together with a Q-learning agent. Below is a section-by-section walkthrough of the code:
Importing the required libraries
python
import numpy as np
import gym
from gym import spaces
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import random
- NumPy for numerical computation
- OpenAI Gym (and its spaces module) for building the reinforcement learning environment
- Matplotlib, with LinearSegmentedColormap, for visualization
- random for random number generation
Note that the code uses the legacy Gym API (reset() returns only the observation and step() returns a 4-tuple), so it expects an older gym release rather than gymnasium.
Configuring a Chinese font
python
plt.rcParams['font.sans-serif'] = ['SimSun'] # 设置宋体
plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
- Configures Matplotlib to render Chinese text with the SimSun font
- Ensures the minus sign is displayed correctly
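SimSun ships with Windows but is usually missing on macOS and Linux, where the Chinese plot titles used later would render as boxes. A minimal cross-platform sketch (the font names below are assumptions about what is installed locally) lists several common CJK fonts and lets Matplotlib use the first one it finds:
python
import matplotlib.pyplot as plt

# Matplotlib walks this list and picks the first installed font.
# Which of these fonts exist is machine-dependent (an assumption, not a guarantee).
plt.rcParams['font.sans-serif'] = [
    'SimSun',            # Windows
    'Microsoft YaHei',   # Windows
    'PingFang SC',       # macOS
    'Noto Sans CJK SC',  # common on Linux
]
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly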
Defining the transportation environment class
python
class TransportationEnv(gym.Env):
"""运输路径优化环境"""
metadata = {'render.modes': ['human']}
- Defines a custom environment by subclassing gym.Env
- Declares 'human' as the only supported render mode
Initializing the environment parameters
python
def __init__(self, grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3):
super(TransportationEnv, self).__init__()
# 环境参数
self.grid_size = grid_size
self.num_obstacles = num_obstacles
self.congestion_factor = congestion_factor
# 动作空间: 上、下、左、右
self.action_space = spaces.Discrete(4)
# 状态空间: 智能体位置(x,y) + 终点位置(x,y) + 拥堵信息(简化为距离终点的曼哈顿距离)
low = np.array([0, 0, 0, 0, 0])
high = np.array([grid_size[0]-1, grid_size[1]-1,
grid_size[0]-1, grid_size[1]-1,
grid_size[0]+grid_size[1]])
self.observation_space = spaces.Box(low, high, dtype=np.int32)
# 初始化环境
self.reset()
- Stores the grid size, the number of obstacles, and the congestion factor
- Defines the action space as 4 discrete actions (up, down, left, right)
- Defines the observation space as a 5-element Box holding the agent position, the goal position, and the Manhattan distance to the goal
- Calls reset() so the environment is fully initialized on construction
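As a quick sanity check (a minimal sketch, assuming the TransportationEnv class above is defined in the same module), you can instantiate the environment and inspect its spaces:
python
env = TransportationEnv(grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3)
print(env.action_space)             # Discrete(4)
print(env.observation_space.shape)  # (5,)
print(env.observation_space.low)    # [0 0 0 0 0]
print(env.observation_space.high)   # [ 9  9  9  9 20]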
Resetting the environment
python
def reset(self):
"""重置环境"""
# 随机生成障碍物
self.obstacles = set()
while len(self.obstacles) < self.num_obstacles:
x = random.randint(0, self.grid_size[0]-1)
y = random.randint(0, self.grid_size[1]-1)
self.obstacles.add((x, y))
# 随机生成起点和终点,确保不在障碍物上
while True:
self.agent_pos = (random.randint(0, self.grid_size[0]-1),
random.randint(0, self.grid_size[1]-1))
if self.agent_pos not in self.obstacles:
break
while True:
self.goal_pos = (random.randint(0, self.grid_size[0]-1),
random.randint(0, self.grid_size[1]-1))
if self.goal_pos != self.agent_pos and self.goal_pos not in self.obstacles:
break
# 初始化路径和步数
self.path = [self.agent_pos]
self.steps = 0
self.max_steps = self.grid_size[0] * self.grid_size[1] * 2
# 生成拥堵区域
self.generate_congestion()
# 返回初始状态
return self.get_state()
- Randomly places the obstacles
- Randomly places the agent's start and the goal, making sure neither lies on an obstacle and that they are distinct
- Initializes the traveled path and the step counter
- Sets the maximum number of steps to twice the number of grid cells
- Generates the congestion areas
- Returns the initial state
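A short usage sketch (assuming the class above): reset the environment and inspect the initial state vector it returns:
python
env = TransportationEnv()
state = env.reset()
print(state)                        # [agent_x, agent_y, goal_x, goal_y, distance_to_goal]
print(env.agent_pos, env.goal_pos)  # randomly chosen, never on an obstacle
print(len(env.obstacles), env.max_steps)  # 15 200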
Generating congestion areas
python
def generate_congestion(self):
"""生成拥堵区域"""
self.congestion = set()
num_congestion = int(self.grid_size[0] * self.grid_size[1] * self.congestion_factor)
# 拥堵区域倾向于靠近中心
center_x, center_y = self.grid_size[0]//2, self.grid_size[1]//2
for _ in range(num_congestion):
while True:
# 以中心为基准的高斯分布生成坐标
x = int(np.clip(np.random.normal(center_x, self.grid_size[0]/4), 0, self.grid_size[0]-1))
y = int(np.clip(np.random.normal(center_y, self.grid_size[1]/4), 0, self.grid_size[1]-1))
pos = (x, y)
if pos not in self.obstacles and pos not in self.congestion:
self.congestion.add(pos)
break
- Computes the number of congested cells from the congestion factor
- Samples congested-cell coordinates from a Gaussian distribution centered on the grid, so congestion clusters around the middle
- Ensures congested cells overlap neither obstacles nor previously generated congested cells
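A small check (again assuming the environment class above) that the generated congestion matches this description:
python
env = TransportationEnv(grid_size=(10, 10), congestion_factor=0.3)
print(len(env.congestion))             # 30 cells: int(10 * 10 * 0.3)
print(env.congestion & env.obstacles)  # set(): congestion never overlaps an obstacle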
Executing an action and returning the result
python
def step(self, action):
"""执行动作并返回下一状态、奖励和终止标志"""
self.steps += 1
# 计算新位置
x, y = self.agent_pos
if action == 0: # 上
y = max(y-1, 0)
elif action == 1: # 下
y = min(y+1, self.grid_size[1]-1)
elif action == 2: # 左
x = max(x-1, 0)
elif action == 3: # 右
x = min(x+1, self.grid_size[0]-1)
new_pos = (x, y)
# 检查是否撞到障碍物
if new_pos in self.obstacles:
reward = -10 # 撞到障碍物的惩罚
done = False # 可以继续尝试
else:
self.agent_pos = new_pos
self.path.append(new_pos)
# 计算奖励
prev_dist = self.manhattan_distance(self.path[-2], self.goal_pos)
curr_dist = self.manhattan_distance(self.agent_pos, self.goal_pos)
if new_pos == self.goal_pos:
reward = 100 # 到达目标的奖励
done = True
elif self.steps >= self.max_steps:
reward = -50 # 超时惩罚
done = True
else:
# 距离减少的奖励,进入拥堵区域的惩罚
reward = (prev_dist - curr_dist) * 2
if new_pos in self.congestion:
reward -= 3 # 拥堵区域惩罚
done = False
# 返回下一状态、奖励、终止标志和额外信息
return self.get_state(), reward, done, {}
- Computes the new position for the chosen action, clamping it to the grid boundaries
- Hitting an obstacle gives a reward of -10; the agent stays in place and the episode continues (note that the timeout check below is skipped on these steps)
- Reaching the goal gives a reward of +100 and ends the episode
- Exceeding the maximum step count gives a reward of -50 and ends the episode
- Otherwise the reward is twice the decrease in Manhattan distance to the goal
- Entering a congested cell costs an extra -3
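A single interaction step, sketched under the assumption that the environment class above is in scope:
python
env = TransportationEnv()
state = env.reset()
action = env.action_space.sample()   # random action: 0=up, 1=down, 2=left, 3=right
next_state, reward, done, info = env.step(action)
print(action, reward, done)
print(next_state)                    # same layout as the state returned by reset()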
Getting the current state and computing the Manhattan distance
python
def get_state(self):
"""获取当前状态"""
dist_to_goal = self.manhattan_distance(self.agent_pos, self.goal_pos)
return np.array([self.agent_pos[0], self.agent_pos[1],
self.goal_pos[0], self.goal_pos[1],
dist_to_goal])
def manhattan_distance(self, pos1, pos2):
"""计算曼哈顿距离"""
return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
- get_state: returns a state vector containing the agent position, the goal position, and the distance to the goal
- manhattan_distance: computes the Manhattan distance between two points (the sum of the horizontal and vertical offsets)
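For example, the Manhattan distance between (2, 3) and (5, 7) is |2 - 5| + |3 - 7| = 3 + 4 = 7; a one-line check (assuming the class above):
python
env = TransportationEnv()
print(env.manhattan_distance((2, 3), (5, 7)))  # 7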
Rendering the environment
python
def render(self, mode='human'):
"""渲染环境"""
if mode == 'human':
# 创建网格
grid = np.zeros(self.grid_size)
# 标记障碍物
for x, y in self.obstacles:
grid[y, x] = -1 # 障碍物用-1表示
# 标记拥堵区域
for x, y in self.congestion:
if grid[y, x] != -1: # 不覆盖障碍物
grid[y, x] = 0.5 # 拥堵区域用0.5表示
# 标记路径
for x, y in self.path:
if grid[y, x] not in [-1, 0.5]: # 不覆盖障碍物和拥堵
grid[y, x] = 0.2 # 路径用0.2表示
# 标记当前位置和目标位置
grid[self.agent_pos[1], self.agent_pos[0]] = 1 # 当前位置用1表示
grid[self.goal_pos[1], self.goal_pos[0]] = 2 # 目标位置用2表示
# 创建自定义颜色映射
colors = [(0, 0, 0), (0.5, 0.5, 0.5), (0.7, 0.7, 0.7), (0, 1, 0), (1, 0, 0)] # 黑, 灰, 浅灰, 绿, 红
cmap_name = 'custom_cmap'
cmap = LinearSegmentedColormap.from_list(cmap_name, colors, N=5)
# 显示网格
plt.figure(figsize=(10, 8))
plt.imshow(grid, cmap=cmap, interpolation='nearest')
plt.colorbar(ticks=[-1, 0, 0.5, 1, 2],
label='-1:障碍物, 0:空, 0.5:拥堵, 1:当前位置, 2:目标')
plt.title('运输路径优化环境')
plt.grid(True, which='both', linestyle='-', color='gray', alpha=0.5)
plt.show()
else:
super(TransportationEnv, self).render(mode=mode)
- Builds a 2D array representing the grid (indexed as grid[y, x])
- Encodes obstacles, congestion, the traveled path, the current position, and the goal as distinct numeric values
- Builds a custom colormap that maps those values to different colors
- Displays the result with Matplotlib
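Rendering a freshly reset environment after a few random moves (a minimal sketch assuming the class above; it opens a Matplotlib window):
python
env = TransportationEnv()
env.reset()
for _ in range(5):              # a few random steps so the path is visible
    _, _, done, _ = env.step(env.action_space.sample())
    if done:
        break
env.render()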
The Q-learning agent class
python
class QLearningAgent:
"""Q学习智能体"""
def __init__(self, state_size, action_size, learning_rate=0.1,
discount_factor=0.9, exploration_rate=1.0,
exploration_decay=0.995, min_exploration_rate=0.01):
self.state_size = state_size
self.action_size = action_size
self.learning_rate = learning_rate
self.discount_factor = discount_factor
self.exploration_rate = exploration_rate
self.exploration_decay = exploration_decay
self.min_exploration_rate = min_exploration_rate
# 离散化状态空间以适应Q表
self.state_bins = self._create_state_bins()
# 初始化Q表
self.q_table = {}
- Stores the Q-learning hyperparameters: learning rate, discount factor, exploration rate, exploration decay, and minimum exploration rate
- Creates state bins intended for discretizing the state space (note that _discretize_state below does not actually use them)
- Initializes the Q-table as an empty dictionary
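Constructing an agent for this environment (a sketch using the defaults shown above, and assuming both classes are defined):
python
env = TransportationEnv(grid_size=(10, 10))
agent = QLearningAgent(state_size=env.observation_space.shape[0],  # 5
                       action_size=env.action_space.n)             # 4
print(agent.exploration_rate, len(agent.q_table))                  # 1.0 0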
State discretization methods
python
def _create_state_bins(self):
"""创建状态离散化的分箱"""
bins = []
# 位置和目标坐标的分箱
for _ in range(4):
bins.append(np.linspace(0, 9, 10)) # 假设网格大小为10x10
# 距离的分箱
bins.append(np.linspace(0, 18, 10)) # 最大曼哈顿距离为18
return bins
def _discretize_state(self, state):
"""将连续状态离散化"""
discrete_state = []
for i, s in enumerate(state):
if i < 4: # 位置和目标坐标
discrete_state.append(int(s))
else: # 距离
discrete_state.append(min(int(s / 2), 9)) # 将距离分箱为0-9
return tuple(discrete_state)
- _create_state_bins: builds bins for discretizing the state; the ranges are hard-coded for a 10x10 grid
- _discretize_state: converts a state vector into a tuple usable as a Q-table key; the four coordinates are cast to int and the distance is bucketed into 0-9 by dividing by 2 (self.state_bins is not consulted here)
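A quick illustration (assuming the QLearningAgent class above): for the state [2, 3, 9, 8, 12] the distance 12 lands in bucket min(12 // 2, 9) = 6:
python
import numpy as np

agent = QLearningAgent(state_size=5, action_size=4)
state = np.array([2, 3, 9, 8, 12])     # agent at (2,3), goal at (9,8), Manhattan distance 12
print(agent._discretize_state(state))  # (2, 3, 9, 8, 6)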
Action selection and Q-table updates
python
def act(self, state):
"""选择动作"""
discrete_state = self._discretize_state(state)
# 探索或利用
if np.random.uniform(0, 1) < self.exploration_rate:
return np.random.choice(self.action_size) # 随机探索
# 利用已知最优动作
if discrete_state not in self.q_table:
return np.random.choice(self.action_size)
q_values = self.q_table[discrete_state]
return np.argmax(q_values)
def update(self, state, action, reward, next_state, done):
"""更新Q表"""
discrete_state = self._discretize_state(state)
discrete_next_state = self._discretize_state(next_state)
# 初始化Q表项(如果不存在)
if discrete_state not in self.q_table:
self.q_table[discrete_state] = np.zeros(self.action_size)
if discrete_next_state not in self.q_table:
self.q_table[discrete_next_state] = np.zeros(self.action_size)
# Q学习更新公式
current_q = self.q_table[discrete_state][action]
max_next_q = np.max(self.q_table[discrete_next_state])
if done:
new_q = reward
else:
new_q = current_q + self.learning_rate * (reward + self.discount_factor * max_next_q - current_q)
self.q_table[discrete_state][action] = new_q
# 衰减探索率
if self.exploration_rate > self.min_exploration_rate:
self.exploration_rate *= self.exploration_decay
- act: selects an action with an ε-greedy policy, balancing exploration and exploitation; states not yet in the Q-table also fall back to a random action
- update: applies the Q-learning update (Bellman equation) to the Q-table; for terminal transitions the target is the reward alone
- The exploration rate is decayed after every update until it reaches the minimum
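A single hand-worked update (a sketch assuming the agent class above): with alpha = 0.1, gamma = 0.9, a current Q-value of 0, a reward of 2, and a maximum next-state Q-value of 0, the new value is 0 + 0.1 * (2 + 0.9 * 0 - 0) = 0.2:
python
import numpy as np

agent = QLearningAgent(state_size=5, action_size=4,
                       learning_rate=0.1, discount_factor=0.9)
s      = np.array([2, 3, 9, 8, 12])  # current state
s_next = np.array([3, 3, 9, 8, 11])  # one step closer to the goal after moving right
agent.update(s, action=3, reward=2, next_state=s_next, done=False)
print(agent.q_table[agent._discretize_state(s)])  # [0.  0.  0.  0.2]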
Training and evaluation functions
python
def train_agent(env, agent, num_episodes=1000):
"""训练智能体"""
rewards_history = []
for episode in range(num_episodes):
state = env.reset()
total_reward = 0
done = False
while not done:
action = agent.act(state)
next_state, reward, done, _ = env.step(action)
agent.update(state, action, reward, next_state, done)
state = next_state
total_reward += reward
rewards_history.append(total_reward)
if (episode + 1) % 100 == 0:
print(f"Episode {episode+1}/{num_episodes}, Average Reward: {np.mean(rewards_history[-100:]):.2f}")
return rewards_history
def evaluate_agent(env, agent, num_episodes=10):
"""评估智能体性能"""
success_count = 0
total_steps = []
for episode in range(num_episodes):
state = env.reset()
done = False
steps = 0
while not done:
action = agent.act(state) # 利用模式,不探索
state, _, done, _ = env.step(action)
steps += 1
if env.agent_pos == env.goal_pos:
success_count += 1
total_steps.append(steps)
print(f"Evaluation Episode {episode+1}: {'Success' if done and env.agent_pos == env.goal_pos else 'Failure'}")
success_rate = success_count / num_episodes
avg_steps = np.mean(total_steps) if total_steps else float('inf')
print(f"Success Rate: {success_rate:.2%}, Average Steps: {avg_steps:.2f}")
return success_rate, avg_steps
- train_agent: trains the agent, records the total reward of every episode, and prints a 100-episode average every 100 episodes
- evaluate_agent: measures the success rate and the average number of steps over a few episodes; note that agent.act() still follows the decayed ε-greedy policy, so a small amount of exploration remains during evaluation
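A shorter end-to-end run with a smoothed reward curve (a sketch assuming all of the definitions above; 200 episodes keeps it quick):
python
import numpy as np
import matplotlib.pyplot as plt

env = TransportationEnv(grid_size=(10, 10))
agent = QLearningAgent(env.observation_space.shape[0], env.action_space.n)
rewards = train_agent(env, agent, num_episodes=200)
evaluate_agent(env, agent, num_episodes=5)

# Smooth the noisy per-episode rewards with a 20-episode moving average.
window = 20
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel(f'Mean reward ({window}-episode window)')
plt.show()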
The main block
python
if __name__ == "__main__":
# 创建环境和智能体
env = TransportationEnv(grid_size=(10, 10))
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = QLearningAgent(state_size, action_size)
# 训练智能体
print("开始训练智能体...")
rewards = train_agent(env, agent, num_episodes=500)
# 评估智能体
print("\n评估智能体性能...")
evaluate_agent(env, agent, num_episodes=5)
# 可视化训练过程
plt.figure(figsize=(10, 6))
plt.plot(rewards)
plt.title('训练奖励')
plt.xlabel('回合')
plt.ylabel('总奖励')
plt.grid(True)
plt.show()
# 可视化最优路径
print("\n可视化最优路径...")
state = env.reset()
done = False
while not done:
action = agent.act(state) # 利用模式
state, _, done, _ = env.step(action)
env.render()
- Creates the environment and the agent
- Trains the agent and prints the training progress
- Evaluates the agent's performance
- Plots the total reward per training episode
- Runs one more episode with the (mostly greedy) learned policy and renders the resulting path
Summary
This code implements a reinforcement-learning-based transportation route optimization system, consisting of:
- A custom Gym environment simulating a grid world with obstacles and congestion areas
- A Q-learning agent that handles the state space by discretizing it into Q-table keys
- Training and evaluation functions for training the agent and measuring its performance
- Visualization utilities for the environment layout and the training curve
At its core is the Q-learning algorithm: by repeatedly interacting with the environment and learning from the rewards, the agent gradually learns a short route from the start to the goal while avoiding obstacles and congested areas.