VRP_用MDP建模_用SB3改进-未完_pymo建模vrp-CSDN博客

本文链接：https://blog.csdn.net/weixin_44253237/article/details/142963775

文章目录

前言
一、配置
二、代码_改1
使用 DQN 解决 VRP 问题的详细解释
总结

前言

VRP_用MDP建模_20241015

一、配置

运行以下命令重新创建环境，并确保正确创建和激活（激活后还需要安装依赖，例如：pip install stable-baselines3 gymnasium matplotlib）：

conda create --name sb3_env python=3.9
conda activate sb3_env

二、代码_改1

在 Stable Baselines3 中，当你的环境返回一个字典类型的观测空间时，你应该使用 MultiInputPolicy，这是专门为处理多输入（例如字典类型观测空间）设计的策略。

# Create the DQN model with MultiInputPolicy
model = DQN(
    "MultiInputPolicy",  # multi-input policy, needed for Dict observation spaces
    env,
    learning_rate=0.001,  # learning rate; tune as needed
    gamma=0.99,           # discount factor
    exploration_fraction=0.1,  # fraction of the total training steps over which epsilon is annealed
    exploration_final_eps=0.02,  # final (lowest) value of epsilon
    buffer_size=50000,    # replay buffer size
    learning_starts=1000,  # number of steps collected before learning starts
    batch_size=32,        # minibatch size
    target_update_interval=500,  # update the target network every 500 environment steps
    train_freq=1,         # update the model every `train_freq` environment steps
    max_grad_norm=10,     # gradient clipping threshold
    verbose=1             # print training information
)

在这里插入图片描述

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import sys
sys.path.append('d:/RL_Code/MDP_VRP_test_02_sb3')
import matplotlib.pyplot as plt
# import os
# os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
# import os
# os.environ['OMP_NUM_THREADS'] = '1'

class VRPEnv(gym.Env):
    """
    Custom Vehicle Routing Problem (VRP) environment formulated as an MDP.

    A single vehicle starts at the depot (node 0) and chooses, at every step,
    the next node to drive to.  The reward is the negative travelled distance,
    or a fixed penalty when the requested move is rejected.

    Observation (``spaces.Dict``):
        vehicle_position:  node the vehicle is at
                           (0 = depot, 1..num_customers = customers).
        vehicle_capacity:  remaining capacity of the vehicle.
        visited_customers: binary vector, 1 where the customer has been served.

    Action (``spaces.Discrete(num_customers + 1)``):
        0 returns to the depot, ``k`` (1..num_customers) visits customer ``k``.

    Episode end:
        terminated: the vehicle returns to the depot after every customer
                    has been served.
        truncated:  ``max_steps`` steps have been taken.
    """

    def __init__(self, num_customers=5, vehicle_capacity=100, max_steps=100, seed=None):
        """
        Build a random problem instance.

        Args:
            num_customers: number of customers to serve.
            vehicle_capacity: capacity of the vehicle at the start of an episode.
            max_steps: step limit after which the episode is truncated.
            seed: optional seed for the random instance (demands and
                positions).  ``None`` keeps the original behaviour of drawing
                from NumPy's global random state.
        """
        super().__init__()

        self.num_customers = num_customers
        self.vehicle_capacity = vehicle_capacity
        self.max_steps = max_steps
        self.current_step = 0

        # State space: vehicle position, remaining capacity, served customers.
        self.observation_space = spaces.Dict({
            "vehicle_position": spaces.Discrete(num_customers + 1),
            "vehicle_capacity": spaces.Discrete(vehicle_capacity + 1),
            "visited_customers": spaces.MultiBinary(num_customers),
        })

        # Action space: index of the next node to drive to.
        self.action_space = spaces.Discrete(num_customers + 1)

        # Same draw order as before (demands first, then positions), so an
        # externally seeded global NumPy state yields the same instance.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.customer_demands = rng.randint(1, 5, size=num_customers)
        customer_xy = rng.rand(num_customers, 2) * 10
        self.depot_position = np.array([0, 0])  # depot is fixed at (0, 0)

        # Row 0 is the depot, rows 1..num_customers are the customers.
        self.customer_positions = np.vstack([self.depot_position, customer_xy])

        self.state = None

    def _get_obs(self):
        """Return a copy of the state so callers cannot mutate it."""
        return {
            "vehicle_position": int(self.state["vehicle_position"]),
            "vehicle_capacity": int(self.state["vehicle_capacity"]),
            "visited_customers": self.state["visited_customers"].copy(),
        }

    def _distance(self, node_a, node_b):
        """Euclidean distance between two nodes (0 = depot)."""
        delta = self.customer_positions[node_a] - self.customer_positions[node_b]
        return float(np.linalg.norm(delta))

    def reset(self, seed=None, options=None):
        """
        Start a new episode on the same problem instance.

        Returns:
            (observation, info): vehicle at the depot with full capacity and
            no customer served; ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.state = {
            "vehicle_position": 0,
            "vehicle_capacity": self.vehicle_capacity,
            "visited_customers": np.zeros(self.num_customers, dtype=np.int8),
        }
        self.current_step = 0
        return self._get_obs(), {}

    def step(self, action):
        """
        Apply one action.

        Args:
            action: 0 to return to the depot, ``k`` to visit customer ``k``.

        Returns:
            (observation, reward, terminated, truncated, info)

            reward is the negative travelled distance for an accepted move,
            ``-10`` for a customer that was already served and ``-100`` when
            the remaining capacity is smaller than the customer's demand.
            Rejected moves leave position, capacity and visited set unchanged.

        Raises:
            AssertionError / ValueError: the action is outside the action space.
        """
        # The assert keeps the original exception type; the explicit range
        # check still guards the method when asserts are stripped (python -O).
        assert self.action_space.contains(action), f"非法动作: {action}"
        if action < 0 or action > self.num_customers:
            raise ValueError(f"Action {action} is out of bounds!")
        action = int(action)  # accept NumPy scalars / 0-d arrays from predict()

        position = int(self.state["vehicle_position"])
        capacity = int(self.state["vehicle_capacity"])
        # Work on a copy: observations already handed out must not change.
        visited = self.state["visited_customers"].copy()
        terminated = False

        if action == 0:
            # NOTE: returning to the depot does not reload the vehicle;
            # capacity is only restored by reset().
            reward = 0.0 if position == 0 else -self._distance(position, 0)
            terminated = bool(visited.all())
            position = 0
        elif visited[action - 1] == 1:
            reward = -10.0  # customer already served: penalty, vehicle stays put
        else:
            demand = int(self.customer_demands[action - 1])
            if demand <= capacity:
                reward = -self._distance(position, action)
                capacity -= demand
                visited[action - 1] = 1
                position = action
            else:
                reward = -100.0  # not enough capacity: penalty, vehicle stays put

        self.state = {
            "vehicle_position": position,
            "vehicle_capacity": capacity,
            "visited_customers": visited,
        }
        self.current_step += 1

        # Hitting the step limit is a time-limit truncation, not a terminal
        # state of the MDP; reporting it as `terminated` would stop value
        # bootstrapping at an arbitrary point of the route.
        truncated = self.current_step >= self.max_steps
        info = {}

        return self._get_obs(), reward, terminated, truncated, info

    def render(self, mode='human'):
        """
        Plot the depot, the customers (green = served, blue = pending) and the
        current vehicle position with matplotlib.
        """
        fig = plt.figure(figsize=(6, 6))
        plt.xlim(-1, 11)
        plt.ylim(-1, 11)

        # Depot
        plt.scatter(self.depot_position[0], self.depot_position[1], color='red', label='Depot', s=100)

        # Customers
        customers = zip(self.customer_positions[1:], self.state["visited_customers"])
        for number, (pos, visited) in enumerate(customers, start=1):
            color = 'green' if visited else 'blue'
            plt.scatter(pos[0], pos[1], color=color, label=f'Customer {number}', s=100)
            plt.text(pos[0]+0.1, pos[1]+0.1, f'{number}', fontsize=12)

        # Vehicle (row 0 of customer_positions is the depot)
        vehicle_pos = self.customer_positions[self.state['vehicle_position']]
        plt.scatter(vehicle_pos[0], vehicle_pos[1], color='black', label='Vehicle', marker='x', s=200)
        plt.text(vehicle_pos[0]+0.1, vehicle_pos[1]+0.1, 'Vehicle', fontsize=12, color='black')

        plt.title(f"Step: {self.current_step} | Remaining Capacity: {self.state['vehicle_capacity']}")
        plt.legend(loc='upper right')
        plt.grid(True)
        plt.show()
        # Release the figure; otherwise one figure per call stays alive when
        # show() does not block (non-interactive backends).
        plt.close(fig)

    def close(self):
        """Nothing to release."""
        pass

import sys

# The search path has to be extended BEFORE the environment module is
# imported, otherwise the import of Gymnasium_env_sb3 below cannot see it.
sys.path.append('d:/RL_Code/MDP_VRP_test_02_sb3')

import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
from Gymnasium_env_sb3 import VRPEnv

# Optional workarounds, kept disabled as in the original script:
# import os
# os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
# os.environ['OMP_NUM_THREADS'] = '1'

# Build the training environment
num_customers = 5
env = VRPEnv(num_customers=num_customers, vehicle_capacity=100, max_steps=100)

# Verify that the environment is compatible with SB3
check_env(env)

# Create the DQN model; MultiInputPolicy is needed for Dict observations
model = DQN(
    "MultiInputPolicy",
    env,
    learning_rate=0.001,          # learning rate; tune as needed
    gamma=0.99,                   # discount factor
    exploration_fraction=0.1,     # fraction of training over which epsilon is annealed
    exploration_final_eps=0.02,   # final (lowest) value of epsilon
    buffer_size=50000,            # replay buffer size
    learning_starts=1000,         # steps collected before learning starts
    batch_size=32,                # minibatch size
    target_update_interval=500,   # target network update period (env steps)
    train_freq=1,                 # update the model every `train_freq` env steps
    max_grad_norm=10,             # gradient clipping threshold
    verbose=1                     # print training information
)

# Train
model.learn(total_timesteps=50000)

# Save
model.save("dqn_vrp_model")

# Evaluate.  VRPEnv draws customer positions and demands randomly in its
# constructor and the observation does not contain them, so a fresh
# environment must reuse the training instance's data; otherwise the policy
# is evaluated on a different problem than the one it was trained on.
train_env = env
env = VRPEnv(num_customers=num_customers, vehicle_capacity=100, max_steps=100)
env.customer_demands = train_env.customer_demands.copy()
env.customer_positions = train_env.customer_positions.copy()
model = DQN.load("dqn_vrp_model", env=env)

obs, _ = env.reset()
for _ in range(100):  # run 100 steps
    # deterministic=True: greedy action, no residual epsilon-exploration
    action, _states = model.predict(obs, deterministic=True)
    # predict() returns a NumPy value; hand a plain int to the Discrete space
    obs, rewards, terminated, truncated, info = env.step(int(action))
    done = terminated or truncated  # either flag ends the episode

    env.render()
    if done:
        obs, _ = env.reset()

print("测试完成")

使用 DQN 解决 VRP 问题的详细解释

该代码展示了如何将Deep Q-Network (DQN) 算法应用于车辆路径规划问题 (VRP)，并使用 Gymnasium 创建自定义强化学习环境。以下是代码的逐步解析。

1. 导入必要的模块

import sys
# Extend the search path first so the environment module below can be found
sys.path.append('d:/RL_Code/MDP_VRP_test_02_sb3')

import gymnasium as gym
from stable_baselines3 import DQN
from Gymnasium_env_sb3 import VRPEnv

使用 gymnasium 处理自定义环境。
从 Stable-Baselines3 导入 DQN 算法。
使用 sys.path.append() 将自定义环境文件路径加入 Python 的搜索路径。

2. 初始化 VRP 环境

# Problem size: number of customers to serve
num_customers = 5
# Custom VRP environment: vehicle capacity 100, at most 100 steps per episode
env = VRPEnv(num_customers=num_customers, vehicle_capacity=100, max_steps=100)

初始化包含 $5$ 个客户的 VRP 环境，车辆容量为 $100$ ，最多运行 $100$ 步。

3. 检查环境的兼容性

# check_env validates the environment against the interface SB3 expects
from stable_baselines3.common.env_checker import check_env
check_env(env)

使用 check_env() 检查环境是否符合 Gymnasium 和 Stable-Baselines3 的接口规范。

4. 创建 DQN 模型

model = DQN(
    "MultiInputPolicy",  # multi-input policy, needed for Dict observation spaces
    env,
    learning_rate=0.001,  # learning rate
    gamma=0.99,           # discount factor
    exploration_fraction=0.1,  # fraction of the total training steps over which epsilon is annealed
    exploration_final_eps=0.02,  # final (lowest) value of epsilon
    buffer_size=50000,    # replay buffer size
    learning_starts=1000,  # number of steps collected before learning starts
    batch_size=32,        # minibatch size
    target_update_interval=500,  # update the target network every 500 environment steps
    train_freq=1,         # update the model every `train_freq` environment steps
    max_grad_norm=10,     # gradient clipping threshold
    verbose=1             # print training information
)

DQN 算法通过 $\epsilon$ -greedy 策略平衡探索和利用。
重要参数：
- learning_rate=0.001：学习率控制梯度下降的步长。
- $\gamma = 0.99$ ：折扣因子，用于平衡长期与短期奖励。
- exploration_fraction=0.1：在总训练步数的前 $10\%$ 内，探索率 $\epsilon$ 从初始值（默认 $1.0$）线性衰减到最终值。
- exploration_final_eps=0.02：探索率的最低值为 $0.02$ 。
- buffer_size=50000：经验回放缓冲区的大小。
- target_update_interval=500：每 $500$ 步更新一次目标网络。

5. 训练模型

# Train the agent for 50,000 environment steps
model.learn(total_timesteps=50000)

在 $50000$ 个时间步内训练模型，使用经验回放优化策略。

6. 保存模型

# Persist the trained model under the name "dqn_vrp_model"
model.save("dqn_vrp_model")

将训练完成的模型保存为 dqn_vrp_model，方便后续使用。

7. 加载模型并测试

# Keep a handle on the training environment so its problem data can be reused
train_env = env
env = VRPEnv(num_customers=num_customers, vehicle_capacity=100, max_steps=100)
# VRPEnv draws customer positions and demands randomly in its constructor;
# copy the training instance so the policy is evaluated on the same problem.
env.customer_demands = train_env.customer_demands.copy()
env.customer_positions = train_env.customer_positions.copy()
model = DQN.load("dqn_vrp_model", env=env)

创建新的 VRP 环境并加载已保存的 DQN 模型。注意：VRPEnv 在构造时会随机生成客户位置和需求，而观测中并不包含这些信息，因此测试环境应使用与训练环境相同的客户数据，否则评估的是另一个问题实例。

8. 验证模型表现

obs, _ = env.reset()
for _ in range(100):
    # deterministic=True: greedy action, no residual epsilon-exploration
    action, _states = model.predict(obs, deterministic=True)
    # predict() returns a NumPy value; hand a plain int to the Discrete space
    obs, rewards, terminated, truncated, info = env.step(int(action))
    done = terminated or truncated  # either flag ends the episode

    env.render()
    if done:
        obs, _ = env.reset()