%% main_dqn
% DQN-based hyper-heuristic for the PACP problem
clear;
clc;
%% Load the pre-generated warehouse data
data = load('warehouse_layout_p5_k3_s42.mat'); % .mat file saved by the layout-generation script
I = data.I;
I0 = data.I0;
Ap = data.Ap;
Dij = data.Dij;
task_partition = data.task_partition;
%% Parameter settings (must match those used when generating the data)
total_partitions = data.total_partitions; % taken directly from the data
k = data.k;                               % taken directly from the data
C = 6;          % AMR capacity
pop_size = 50;  % population size
max_gen = 1000; % maximum number of generations
vp = 1;         % picker speed
vr = 2;         % AMR speed
cp = 10;        % picker cost per unit time
cr = 5;         % AMR cost per unit time
%% Run the DQN-based hyper-heuristic
[best_solution, best_cost, stats] = dqn_hyperheuristic_pacp(I, I0, Dij, task_partition, k, C, pop_size, max_gen, vp, vr, cp, cr);
% Display results
disp('Best task order (OI):');
disp(best_solution.OI);
disp('AMR assignment (RA):');
disp(best_solution.RA);
fprintf('Minimum total cost: %.2f yuan\n', best_cost);
% Visualize convergence
visualize_convergence_main(stats);
%% Main function: DQN-based hyper-heuristic
function [best_solution, best_cost, stats] = dqn_hyperheuristic_pacp(I, I0, Dij, task_partition, k, C, pop_size, max_gen, vp, vr, cp, cr)
% Problem initialization
N = size(I, 1) - 1;                 % number of tasks
t = rand(1, N) * 10;                % per-task picking times, drawn uniformly from [0, 10]
pheromone = ones(N+1, N+1) * 0.1;   % pheromone matrix initialization
global_best = struct('OI', [], 'RA', [], 'cost', inf); % global best solution
% DQN hyper-parameters
state_dim = 3;              % delta_cost, diversity, phase
action_dim = 9;             % number of low-level heuristics (LLHs)
hidden_units = 64;
replay_buffer_size = 1000;
batch_size = 32;
update_target_freq = 50;
gamma = 0.9;                % discount factor
epsilon_start = 0.3;
epsilon_min = 0.01;
epsilon_decay = 0.995;
learn_rate = 1e-3;          % Adam learning rate (assumed default)
trailingAvg = [];           % Adam state for adamupdate
trailingAvgSq = [];
% Build the online and target DQN networks
online_net = build_dqn(state_dim, hidden_units, action_dim);
target_net = build_dqn(state_dim, hidden_units, action_dim);
target_net = updateTargetNetwork(target_net, online_net); % synchronise parameters
% Initialize the population and statistics
population = initialize_population(pop_size, N, C, Dij, task_partition);
[costs, current_best] = evaluate_population(population, Dij, task_partition, t, vp, vr, cp, cr, C);
[best_cost, min_idx] = min(costs);
best_solution = population(min_idx);
global_best = current_best; % initial global best
% Experience replay buffer
replay_buffer = struct('state', {}, 'action', {}, 'reward', {}, 'next_state', {}, 'done', {});
% Training statistics
stats.best_costs = zeros(max_gen, 1);
epsilon = epsilon_start;
for gen = 1:max_gen
    % Current state (continuous features)
    current_state = get_continuous_state(costs, best_cost, population, gen, max_gen, N);
    % Epsilon-greedy action selection
    if rand < epsilon
        action = randi(action_dim);
    else
        % Feed the state as a 3x1 column with 'CB' labels (channel x batch)
        current_state_dl = dlarray(current_state(:), 'CB');
        q_values = extractdata(predict(online_net, current_state_dl)); % dlarray -> numeric
        [~, action] = max(q_values);
    end
    % Apply the selected low-level heuristic (LLH)
    new_population = apply_LLH(population, action, C, N, Dij, vr, task_partition, t, vp, cp, cr, global_best, pheromone);
    [new_costs, current_best] = evaluate_population(new_population, Dij, task_partition, t, vp, vr, cp, cr, C);
    % Reward and next state (computed against the incumbent best before updating it)
    [new_min_cost, ~] = min(new_costs);
    reward = calculate_reward(best_cost, new_min_cost);
    next_state = get_continuous_state(new_costs, new_min_cost, new_population, gen, max_gen, N);
    done = (gen == max_gen); % terminal flag
    % Update the global best and the returned incumbent
    if current_best.cost < global_best.cost
        global_best = current_best;
    end
    if global_best.cost < best_cost
        best_cost = global_best.cost;
        best_solution = global_best;
    end
    % Store the experience
    experience = struct(...
        'state', current_state, ...
        'action', action, ...
        'reward', reward, ...
        'next_state', next_state, ...
        'done', done);
    if length(replay_buffer) >= replay_buffer_size
        replay_buffer(1) = []; % drop the oldest experience
    end
    replay_buffer(end+1) = experience;
    % Experience replay training
    if length(replay_buffer) >= batch_size
        % Sample a random mini-batch
        batch_idx = randperm(length(replay_buffer), batch_size);
        batch = replay_buffer(batch_idx);
        % Unpack the batch
        states = cat(1, batch.state);            % batch_size x 3
        actions = [batch.action];
        rewards = [batch.reward];
        next_states = cat(1, batch.next_state);  % batch_size x 3
        dones = [batch.done];
        % Convert states to dlarray: each column is one sample (3 x batch_size, 'CB')
        states_dl = dlarray(states', 'CB');
        next_states_dl = dlarray(next_states', 'CB');
        % Target Q-values: r + gamma * max_a Q_target(s', a) for non-terminal samples
        target_q = rewards(:);
        non_final = ~dones;
        if any(non_final)
            next_q = extractdata(predict(target_net, next_states_dl(:, non_final)));
            max_next_q = max(next_q, [], 1); % max over actions, 1 x nnz(non_final)
            target_q(non_final) = target_q(non_final) + gamma * max_next_q(:);
        end
        % Build the full target matrix: only the taken action's entry is replaced
        q_pred = extractdata(predict(online_net, states_dl)); % action_dim x batch_size
        q_target = q_pred;
        for i = 1:batch_size
            q_target(actions(i), i) = target_q(i);
        end
        q_target_dl = dlarray(q_target, 'CB');
        % One Adam gradient step on the online network (custom loop, see dqn_loss)
        [~, gradients] = dlfeval(@dqn_loss, online_net, states_dl, q_target_dl);
        [online_net, trailingAvg, trailingAvgSq] = adamupdate(online_net, gradients, ...
            trailingAvg, trailingAvgSq, gen, learn_rate);
    end
    % Periodically synchronise the target network
    if mod(gen, update_target_freq) == 0
        target_net = updateTargetNetwork(target_net, online_net);
    end
    % Pheromone update along the global-best path
    pheromone = update_pheromone(pheromone, global_best.OI);
    % Epsilon decay
    epsilon = max(epsilon_min, epsilon * epsilon_decay);
    % Move to the next generation
    population = new_population;
    costs = new_costs;
    stats.best_costs(gen) = best_cost;
end
end
%% Helper: build the DQN network (two hidden layers, linear output)
function net = build_dqn(input_dim, hidden_units, output_dim)
layers = [
    featureInputLayer(input_dim)
    fullyConnectedLayer(hidden_units)
    reluLayer()
    fullyConnectedLayer(hidden_units) % second hidden layer
    reluLayer()
    fullyConnectedLayer(output_dim)   % one Q-value per LLH action
    ];
net = dlnetwork(layers);
end
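%% Helper: loss and gradients for one DQN training step
% A minimal sketch used by the dlfeval/adamupdate custom training step above.
% It assumes the built-in half-mean-squared-error loss (mse) between the online
% network's Q-values and the full target matrix constructed in the replay step;
% entries other than the taken action equal the prediction, so they contribute
% zero gradient.
function [loss, gradients] = dqn_loss(net, states_dl, q_target_dl)
q_pred = forward(net, states_dl);             % forward pass in training mode
loss = mse(q_pred, q_target_dl);              % half-MSE against the targets
gradients = dlgradient(loss, net.Learnables); % gradients w.r.t. learnable parameters
end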
%% Helper: copy the online network's parameters into the target network
function targetNet = updateTargetNetwork(targetNet, onlineNet)
targetNet.Learnables = onlineNet.Learnables; % returned because MATLAB passes arguments by value
end
%% Helper: continuous state features for the DQN
function state = get_continuous_state(costs, best_cost, population, gen, max_gen, N)
% 1. Relative gap between the mean population cost and the incumbent best
delta_cost = (mean(costs) - best_cost) / (best_cost + 1e-6); % avoid division by zero
% 2. Population diversity
diversity = calculate_diversity(population, N);
% 3. Search phase (0-1)
phase = gen / max_gen;
state = [delta_cost, diversity, phase]; % 1x3 row vector
end
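%% Helper (assumed sketch): population diversity
% calculate_diversity is called above but defined elsewhere in the project; the
% placeholder below is only an illustrative assumption (mean pairwise Hamming
% distance of the OI sequences, normalised to [0,1]) and should be removed if
% the project's own calculate_diversity is on the path.
function diversity = calculate_diversity(population, N)
pop_size = numel(population);
OIs = zeros(pop_size, N);
for p = 1:pop_size
    OIs(p, :) = population(p).OI(1:N); % assumes each individual stores a task order OI of length >= N
end
total = 0; pairs = 0;
for a = 1:pop_size-1
    for b = a+1:pop_size
        total = total + sum(OIs(a, :) ~= OIs(b, :)) / N; % normalised Hamming distance
        pairs = pairs + 1;
    end
end
diversity = total / max(pairs, 1);
end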
%% Helper: pheromone update along the global-best path
function pheromone = update_pheromone(pheromone, best_path)
pheromone = pheromone * 0.9; % evaporation
for i = 1:length(best_path)-1
    % +1 offset maps task indices into the (N+1)x(N+1) pheromone matrix
    pheromone(best_path(i)+1, best_path(i+1)+1) = ...
        pheromone(best_path(i)+1, best_path(i+1)+1) + 0.1; % deposit on the best path
end
end
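%% Helper (assumed sketch): reward for the DQN agent
% calculate_reward is called in the main loop but defined elsewhere in the
% project; the placeholder below is only an illustrative assumption (positive
% reward proportional to the relative improvement over the incumbent best,
% small penalty otherwise) and should be removed if the project's own
% calculate_reward is on the path.
function reward = calculate_reward(old_best_cost, new_min_cost)
if new_min_cost < old_best_cost
    reward = (old_best_cost - new_min_cost) / (old_best_cost + 1e-6); % relative improvement
else
    reward = -0.01; % mild penalty when no improvement is found
end
end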