state, action, next_state, reward, done = map(np.stack, zip(*batch))

This passage explains how zip and numpy.stack are used to implement experience replay in a Deep Q-Network (DQN): a sampled batch of experiences is split into its components (states, actions, and so on) and converted into NumPy arrays so that they can feed the training step of a machine learning or reinforcement learning algorithm.
  1. zip(*batch): the unpacking operator * expands the batch list so that each stored experience is passed to zip as a separate argument; zip then regroups them into five separate sequences, each collecting the corresponding element from every experience (for example, all states end up together).
  2. map(np.stack, ...): applies the np.stack function to each of these five sequences, stacking its elements into a single new NumPy array, which is usually necessary for the numerical computations that follow.
  3. Finally, the five NumPy arrays are assigned to the variables state, action, next_state, reward, and done.

In this way we obtain five NumPy arrays, each containing one specific part of the batch of experiences (all states, all actions, and so on), ready to be used by the subsequent machine learning or reinforcement learning algorithm.
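As a minimal sketch of what this line does (the batch below is a made-up toy example whose field order matches the assignment above), tracing it on two transitions gives:

import numpy as np

# A toy replay batch: each entry is (state, action, next_state, reward, done).
batch = [
    (np.array([0.1, 0.2]), 1, np.array([0.3, 0.4]), 1.0, False),
    (np.array([0.5, 0.6]), 0, np.array([0.7, 0.8]), 0.0, True),
]

# zip(*batch) regroups the transitions field by field,
# and np.stack turns each group into one NumPy array.
state, action, next_state, reward, done = map(np.stack, zip(*batch))

print(state.shape)   # (2, 2) -> a batch of two states
print(action)        # [1 0]
print(done)          # [False  True]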

This experience replay mechanism is a key component of algorithms such as Deep Q-Networks (DQN): it allows the agent to learn from stored past experiences instead of relying only on consecutive, freshly collected ones, which helps break the correlation between successive training samples.
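For context, a minimal replay buffer in which such a sampling line typically lives might look like the sketch below; the class and method names are illustrative, not taken from any particular library.

import random
from collections import deque
import numpy as np

class ReplayBuffer:
    """Fixed-size buffer that stores transitions and samples random mini-batches."""

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward, done):
        # Store one transition; the tuple order must match the unpacking order in sample().
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Regroup by field and stack each group into a NumPy array, as explained above.
        state, action, next_state, reward, done = map(np.stack, zip(*batch))
        return state, action, next_state, reward, done

    def __len__(self):
        return len(self.buffer)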
