DRL1 - Distributed Multi-Agent Q-learning

import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

class DistributedQLearningAgent:
    def __init__(self, n_actions, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1):
        self.q_table = defaultdict(lambda: np.zeros(n_actions))
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.n_actions = n_actions
    
    def choose_action(self, state):
        if np.random.random() < self.epsilon:
            return np.random.choice(self.n_actions)
        else:
            return np.argmax(self.q_table[state])
    
    def learn(self, state, action, reward, next_state, neighbors_q_tables):
        max_neighbor_q = -np.inf
        for neighbor_q in neighbors_q_tables:
            neighbor_max_q = np.max(neighbor_q[next_state])
            if neighbor_max_q > max_neighbor_q:
                max_neighbor_q = neighbor_max_q
        
        if max_neighbor_q == -np.inf:
            max_neighbor_q = np.max(self.q_table[next_state])
        
        td_target = reward + self.gamma * max_neighbor_q
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * td_error

# Parameter settings
n_agents = 3
n_actions = 2
n_states = 5
agents = [DistributedQLearningAgent(n_actions) for _ in range(n_agents)]

# Reward function
def get_reward(state, action):
    return np.random.normal(loc=state*action, scale=0.1)

# Training loop
episode_rewards = []
for episode in range(1000):
    states = [np.random.randint(0, n_states) for _ in range(n_agents)]
    episode_reward = 0
    
    for t in range(100):
        actions = [agent.choose_action(state) for agent, state in zip(agents, states)]
        rewards = [get_reward(state, action) for state, action in zip(states, actions)]
        episode_reward += sum(rewards)
        next_states = [(state + action) % n_states for state, action in zip(states, actions)]
        
        for i in range(n_agents):
            neighbors = [agents[j].q_table for j in range(n_agents) if j != i]
            agents[i].learn(states[i], actions[i], rewards[i], next_states[i], neighbors)
        
        states = next_states
    
    episode_rewards.append(episode_reward)

# 1. Print every agent's Q-table
for i, agent in enumerate(agents):
    print(f"\nAgent {i+1} Q-table:")
    for state in range(n_states):
        print(f"State {state}: {agent.q_table[state]}")

# 2. Visualize the Q-value matrices
plt.figure(figsize=(15, 5))
for i, agent in enumerate(agents):
    plt.subplot(1, n_agents, i+1)
    
    # Convert the Q-table into a matrix
    q_matrix = np.zeros((n_states, n_actions))
    for state in range(n_states):
        q_matrix[state] = agent.q_table[state]
    
    # Plot the heatmap
    sns.heatmap(q_matrix, annot=True, fmt=".2f", cmap="YlGnBu", 
                xticklabels=[f"Action {a}" for a in range(n_actions)],
                yticklabels=[f"State {s}" for s in range(n_states)])
    plt.title(f"Agent {i+1} Q-Value Matrix")

plt.tight_layout()

# 3. Plot the reward curve
plt.figure(figsize=(10, 5))
plt.plot(episode_rewards, alpha=0.3, label='Episode Reward')
window_size = 50
moving_avg = np.convolve(episode_rewards, np.ones(window_size)/window_size, mode='valid')
plt.plot(np.arange(window_size-1, len(episode_rewards)), moving_avg, 'r-', 
         linewidth=2, label=f'Moving Average (window={window_size})')
plt.title("System Reward Over Episodes")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.legend()
plt.grid(True)

plt.show()


Below is a detailed walkthrough of this code, split into three parts: the core class, the training procedure, and visualization.


I. The DistributedQLearningAgent Class (Core Algorithm)

class DistributedQLearningAgent:
    def __init__(self, n_actions, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1):
        self.q_table = defaultdict(lambda: np.zeros(n_actions))  # state-action value table
        self.alpha = learning_rate      # learning rate (update step size)
        self.gamma = discount_factor    # discount factor (weight on future rewards)
        self.epsilon = exploration_rate # exploration rate
        self.n_actions = n_actions      # size of the action space
Key methods:
  1. Action selection (ε-greedy policy)
def choose_action(self, state):
    if np.random.random() < self.epsilon:  # explore: random action with probability ε
        return np.random.choice(self.n_actions)
    else:                                  # exploit: best-known action with probability 1-ε
        return np.argmax(self.q_table[state])
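Written out as a formula, this behaviour policy picks each action with probability

\pi(a \mid s) = \begin{cases} 1 - \varepsilon + \varepsilon / |A|, & a = \arg\max_{a'} Q(s, a') \\ \varepsilon / |A|, & \text{otherwise} \end{cases}

where |A| = n_actions, since the random branch can also land on the greedy action.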
  2. Distributed Q-learning update (shown here in a compact form equivalent to the loop in the full class above)
def learn(self, state, action, reward, next_state, neighbors_q_tables):
    # Take the maximum Q-value over the neighbours' Q-tables (the core distributed step)
    max_neighbor_q = max([np.max(q[next_state]) for q in neighbors_q_tables], default=-np.inf)
    
    # Fall back to the agent's own Q-values when there are no neighbours
    if max_neighbor_q == -np.inf:
        max_neighbor_q = np.max(self.q_table[next_state])
    
    # Standard Q-learning update
    td_target = reward + self.gamma * max_neighbor_q  # target value
    td_error = td_target - self.q_table[state][action]  # temporal-difference error
    self.q_table[state][action] += self.alpha * td_error  # Q-value update
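In equation form, the update performed by agent i is

Q_i(s, a) \leftarrow Q_i(s, a) + \alpha \Big[ r + \gamma \max_{j \neq i} \max_{a'} Q_j(s', a') - Q_i(s, a) \Big]

with the inner maximum falling back to agent i's own Q-table when the neighbour list is empty, exactly as in the code above.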

II. Training Procedure

1. Initialization
n_agents = 3    # number of agents
n_actions = 2   # number of actions per agent
n_states = 5    # size of the state space
agents = [DistributedQLearningAgent(n_actions) for _ in range(n_agents)]  # create the agent population
2. Environment interaction
def get_reward(state, action):
    # Reward function: linear in state * action, plus Gaussian noise
    return np.random.normal(loc=state*action, scale=0.1)
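As a minimal sketch (not part of the original script), the expected reward E[r | s, a] = s * a implied by get_reward can be tabulated directly, which makes explicit what the agents should eventually learn:

import numpy as np

n_states, n_actions = 5, 2
# Expected reward for every (state, action) pair: loc = state * action in get_reward
expected_reward = np.array([[s * a for a in range(n_actions)]
                            for s in range(n_states)])
print(expected_reward)  # rows: states 0-4, columns: actions 0 and 1
# For every state s > 0, action 1 has the higher expected reward (s vs. 0);
# in state 0 both actions are equivalent in expectation.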
3. Main training loop
episode_rewards = []
for episode in range(1000):  # 1000 training episodes
    states = [np.random.randint(0, n_states) for _ in range(n_agents)]  # random initial states
    episode_reward = 0
    
    for t in range(100):  # at most 100 steps per episode
        # 1. Every agent selects an action
        actions = [agent.choose_action(state) for agent, state in zip(agents, states)]
        
        # 2. Collect rewards and transition to the next states
        rewards = [get_reward(state, action) for state, action in zip(states, actions)]
        episode_reward += sum(rewards)
        next_states = [(state + action) % n_states for state, action in zip(states, actions)]  # cyclic state space
        
        # 3. Distributed learning
        for i in range(n_agents):
            neighbors = [agents[j].q_table for j in range(n_agents) if j != i]  # collect the neighbours' Q-tables
            agents[i].learn(states[i], actions[i], rewards[i], next_states[i], neighbors)
        
        states = next_states  # advance the state
    
    episode_rewards.append(episode_reward)  # record this episode's total reward

III. Visualizing the Results

1. Printing the Q-tables
for i, agent in enumerate(agents):
    print(f"\nAgent {i+1} Q-table:")
    for state in range(n_states):
        print(f"State {state}: {agent.q_table[state]}")

Example output:

Agent 1 Q-table:
State 0: [0.12 0.45]
State 1: [1.23 0.89]
...
2. Q-value heatmaps
plt.figure(figsize=(15, 5))
for i, agent in enumerate(agents):
    plt.subplot(1, n_agents, i+1)
    q_matrix = np.array([agent.q_table[state] for state in range(n_states)])  # convert the Q-table to a matrix
    sns.heatmap(q_matrix, annot=True, fmt=".2f", cmap="YlGnBu",
                xticklabels=[f"Action {a}" for a in range(n_actions)],
                yticklabels=[f"State {s}" for s in range(n_states)])
    plt.title(f"Agent {i+1} Q-Value Matrix")
  • Colour intensity encodes the magnitude of the Q-value
  • The numeric annotations show the exact values
  • Each row corresponds to a state, each column to an action
3. Reward curve analysis
plt.plot(episode_rewards, alpha=0.3)  # raw per-episode reward (semi-transparent)
moving_avg = np.convolve(episode_rewards, np.ones(50)/50, mode='valid')  # 50-episode moving average
plt.plot(np.arange(49, 1000), moving_avg, 'r-', linewidth=2)  # smoothed trend line
  • Blue curve: raw episode rewards (noisy)
  • Red curve: 50-episode moving average (shows the convergence trend)

IV. Key Design Ideas

  1. Distributed learning
    • Each agent receives the other agents' Q-tables via neighbors_q_tables
    • The update bootstraps from the neighbours' maximum Q-value (max_neighbor_q)
  2. Environment design
    • State space: discrete integers (0-4)
    • Transition rule: next_state = (state + action) % n_states
    • Reward function: state * action + noise
  3. Exploration-exploitation balance
    • The ε-greedy policy guarantees continued exploration
    • As training progresses, the Q-tables gradually converge towards the optimal policy (see the sketch after this list)
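A minimal sketch of that last point (assuming the trained agents list and n_states from the script above are in scope): after training, the greedy policy can be read off each Q-table directly.

# Extract each agent's greedy policy: the action with the highest Q-value in every state.
for i, agent in enumerate(agents):
    greedy_policy = [int(np.argmax(agent.q_table[s])) for s in range(n_states)]
    print(f"Agent {i+1} greedy policy (state -> action): {greedy_policy}")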

This implementation demonstrates the complete workflow of cooperative decision-making via distributed Q-learning in a multi-agent system, and the visualization section helps build intuition about the learning dynamics.
