I. System Architecture Design
1. State Space Definition
% State vector: [x, y, theta, v] (position, heading angle, velocity)
state = [2, 3, pi/4, 0.5]; % initial state
goal = [8, 8];             % goal position
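The continuous pose above has to be mapped onto the discrete grid that indexes the Q-table in Section II. A minimal sketch of that mapping is given below; the helper name `discretize_state` is an assumption introduced here for illustration.

% Assumed helper (not in the original article): map a continuous state
% onto integer grid indices so it can index Q(ix, iy, :).
function idx = discretize_state(state, gridSize)
    ix = min(max(round(state(1)), 1), gridSize(1));
    iy = min(max(round(state(2)), 1), gridSize(2));
    idx = [ix, iy];
end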
2. Action Space Design
% Discrete action set: (forward velocity, steering rate) pairs
actions = [0.2,  0;    % go straight
           0.1,  0.1;  % turn left while moving forward
           0.1, -0.1]; % turn right while moving forward
3. Reward Function Design
function r = calc_reward(state, action, next_state)
    % 'action' is unused here but kept for a consistent call signature
    goal = [8, 8]; % goal position (must match the environment)
    % Distance reward: the closer to the goal, the larger the reward
    dist = norm(next_state(1:2) - goal);
    progress = 1 / (1 + dist);
    % Heading reward: penalize deviation from the bearing to the goal
    bearing = atan2(goal(2) - state(2), goal(1) - state(1));
    angle_error = atan2(sin(next_state(3) - bearing), cos(next_state(3) - bearing)); % wrapped to [-pi, pi]
    angle_reward = 0.5 * (1 - abs(angle_error)/pi);
    % Collision penalty: obstacle located at (3, 3)
    if norm(next_state(1:2) - [3, 3]) < 0.5
        collision_penalty = -10;
    else
        collision_penalty = 0;
    end
    r = 0.6*progress + 0.3*angle_reward + collision_penalty;
end
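A quick sanity check of the reward shaping (the transition below is made up for illustration):

% Example: one small step toward the goal from the initial state
s  = [2, 3, pi/4, 0.5];
s2 = [2.07, 3.07, pi/4, 0.5];   % hypothetical next state
r  = calc_reward(s, 1, s2);     % the action index is ignored by the reward
fprintf('reward = %.3f\n', r);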
II. Q-learning Algorithm Implementation
1. Core Training Loop
% Hyperparameters
alpha = 0.1;     % learning rate
gamma = 0.9;     % discount factor
epsilon = 0.2;   % exploration rate
episodes = 1000;
% Q-table initialization
num_states = 10; % size of the discretized grid (10 x 10 cells)
Q = zeros(num_states, num_states, size(actions,1));
for ep = 1:episodes
    state = env.reset();   % reset the environment
    done = false;
    steps = 0;
    while ~done && steps < 200   % cap the episode length to avoid endless loops
        steps = steps + 1;
        % Discretize the continuous position (helper from Section I)
        s = discretize_state(state, [num_states, num_states]);
        % Epsilon-greedy action selection
        if rand < epsilon
            action_idx = randi(size(actions,1));      % explore
        else
            [~, action_idx] = max(Q(s(1), s(2), :));  % exploit
        end
        % Apply the action
        next_state = env.step(action_idx);
        reward = calc_reward(state, action_idx, next_state);
        ns = discretize_state(next_state, [num_states, num_states]);
        % Temporal-difference update of the Q-value
        Q(s(1), s(2), action_idx) = Q(s(1), s(2), action_idx) + ...
            alpha*(reward + gamma*max(Q(ns(1), ns(2), :)) - Q(s(1), s(2), action_idx));
        state = next_state;
        if norm(state(1:2) - goal) < 0.2
            done = true;
        end
    end
    % Monitor training progress
    plot(ep, mean(Q(:)), 'bo');
    hold on;
    xlabel('Episode'); ylabel('Avg Q-value');
end
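Once training has finished, the table can be turned into a deterministic policy by taking the greedy action in each grid cell. The short rollout below is a sketch; the name `policy` and the 200-step cap are assumptions made here.

% Greedy policy: best action index for every grid cell
[~, policy] = max(Q, [], 3);   % num_states x num_states matrix of action indices
% Roll the policy out from the start state
state = env.reset();
for k = 1:200
    s = discretize_state(state, [num_states, num_states]);
    state = env.step(policy(s(1), s(2)));
    if norm(state(1:2) - goal) < 0.2
        break;   % goal reached
    end
end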
2. Environment Interaction Class
classdef PathEnv < handle
    properties
        gridSize = [10, 10];
        obstacles = [3,3; 5,5; 7,7];
        startPos = [2, 3];
        goalPos = [8, 8];
        actions = [0.2, 0; 0.1, 0.1; 0.1, -0.1]; % same action set as in Section I
        state                                     % current continuous state [x, y, theta, v]
    end
    methods
        function state = reset(obj)
            obj.state = [obj.startPos(1), obj.startPos(2), pi/4, 0.5];
            state = obj.state;
        end
        function next_state = step(obj, action_idx)
            action = obj.actions(action_idx, :);
            % Propagate the kinematic model
            next_state = obj.kinematics(obj.state, action);
            % Clamp the position to the workspace boundaries
            next_state(1) = max(1, min(obj.gridSize(1), next_state(1)));
            next_state(2) = max(1, min(obj.gridSize(2), next_state(2)));
            obj.state = next_state;
        end
        function next = kinematics(~, state, action)
            % Unicycle model integrated with a fixed time step
            dt = 0.1;
            v = action(1);
            w = action(2);
            x = state(1) + v*cos(state(3))*dt;
            y = state(2) + v*sin(state(3))*dt;
            theta = state(3) + w*dt;
            next = [x, y, theta, v];
        end
    end
end
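A short usage check of the class (values in the comments are approximate):

env = PathEnv();
s0 = env.reset();   % returns [2, 3, pi/4, 0.5]
s1 = env.step(1);   % 'go straight': x and y each advance by about 0.014
disp(s1(1:2));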
III. Regression Algorithm Integration
1. Trajectory Prediction Model
% Train a regression model on logged experience
load('training_data.mat');  % provides trainingData with state-action-next_state rows
% Columns 1:3 (x, y, theta) are the predictors, column 4 is the action to regress
mdl = fitlm(trainingData(:,1:3), trainingData(:,4));
% Prediction wrapper
function action = predict_action(model, state)
    action = predict(model, state(1:3)); % use the same 3 predictors as in training
end
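The article assumes that training_data.mat already exists. One way to produce it, sketched below, is to log (state, action) pairs while running the greedy policy learned in Section II; the column layout [x, y, theta, action] is an assumption made here.

% Sketch: collect (state, action) pairs from greedy rollouts and save them
trainingData = [];
for ep = 1:50
    state = env.reset();
    for k = 1:200
        s = discretize_state(state, [num_states, num_states]);
        [~, action_idx] = max(Q(s(1), s(2), :));
        trainingData(end+1, :) = [state(1:3), action_idx]; %#ok<AGROW>
        state = env.step(action_idx);
        if norm(state(1:2) - env.goalPos) < 0.2, break; end
    end
end
save('training_data.mat', 'trainingData');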
2. Hybrid Decision Strategy
function action_idx = hybrid_decision(Q, model, state)
    % Q-learning branch: greedy action from the table
    sx = min(max(round(state(1)), 1), size(Q,1));
    sy = min(max(round(state(2)), 1), size(Q,2));
    q_vals = Q(sx, sy, :);
    % Regression branch: predicted (continuous) action index, snapped to a valid one
    pred_action = predict(model, state(1:3));
    pred_idx = min(max(round(pred_action), 1), size(Q,3));
    % Mixed policy: favor the regression prediction 70% of the time
    if rand < 0.7
        action_idx = pred_idx;           % regression-guided choice
    else
        [~, action_idx] = max(q_vals);   % fall back to Q-learning
    end
end
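Example call, assuming the Q table from Section II and the mdl regression model trained above:

state = env.reset();
a = hybrid_decision(Q, mdl, state);   % index into the actions table
next_state = env.step(a);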
IV. Visualization Module
1. Training Progress Monitoring
% 'rewards' is assumed to hold one cumulative reward per episode
% (e.g., the agent's reward log from Section V)
figure;
subplot(2,1,1);
plot(1:episodes, rewards, '-r', 'LineWidth', 2);
title('Training reward curve');
xlabel('Episode'); ylabel('Cumulative reward');
subplot(2,1,2);
imagesc(Q(:,:,1)); colorbar;
title('Q-value distribution (action 1)');
2. Path Planning Demo
function visualize_path(env, agent, num_steps)
    state = env.reset();
    path = state(1:2);
    for k = 1:num_steps
        action_idx = agent.choose_action(state);
        state = env.step(action_idx);
        path = [path; state(1:2)]; %#ok<AGROW>
        % Redraw the environment
        cla;
        hold on;
        plot(env.goalPos(1), env.goalPos(2), 'go', 'MarkerSize', 10);  % goal
        plot(env.obstacles(:,1), env.obstacles(:,2), 'rx');            % obstacles
        plot(path(:,1), path(:,2), 'b-o');                             % executed path
        xlim([1 env.gridSize(1)]);
        ylim([1 env.gridSize(2)]);
        drawnow;
    end
end
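Typical invocation (assumes the env and agent objects created in Section V):

figure;
visualize_path(env, agent, 100);   % animate up to 100 control steps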
V. Complete Training Pipeline
% Initialize the environment and the agent
env = PathEnv();
agent = QLearningAgent(env);
% Training loop
for ep = 1:1000
    state = env.reset();
    total_reward = 0;
    for k = 1:200
        % Select an action
        action_idx = agent.choose_action(state);
        % Apply it
        next_state = env.step(action_idx);
        reward = calc_reward(state, action_idx, next_state);
        % Store the full transition (method name follows the agent sketch below)
        agent.remember(state, action_idx, reward, next_state);
        % Update the Q-values
        agent.update(state, action_idx, reward, next_state);
        state = next_state;
        total_reward = total_reward + reward;
        if norm(state(1:2) - env.goalPos) < 0.2
            break;
        end
    end
    % Periodically sync the target table
    if mod(ep, 10) == 0
        agent.update_target_network();
    end
    % Log training statistics
    agent.log(ep, total_reward);
end
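The loop above relies on a QLearningAgent class that the article never lists. The following is a minimal, hedged sketch of an agent matching the calls used here (choose_action, remember, update, update_target_network, log); all property names and the tabular "target network" (a periodically synced copy of the Q-table rather than a neural network) are assumptions made for illustration.

% QLearningAgent.m -- hedged sketch, not the article's original implementation
classdef QLearningAgent < handle
    properties
        env                 % PathEnv handle
        actions             % discrete action set (copied from the environment)
        alpha = 0.1         % learning rate
        gamma = 0.9         % discount factor
        epsilon = 0.2       % exploration rate
        Q                   % main Q-table
        Q_target            % periodically synced copy used as the bootstrap target
        memory = {}         % simple experience log (cell array of transitions)
        reward_log = []     % cumulative reward per episode
    end
    methods
        function obj = QLearningAgent(env)
            obj.env = env;
            obj.actions = env.actions;
            obj.Q = zeros(env.gridSize(1), env.gridSize(2), size(env.actions, 1));
            obj.Q_target = obj.Q;
        end
        function s = discretize(obj, state)
            % Map a continuous state onto valid grid indices
            s = [min(max(round(state(1)), 1), obj.env.gridSize(1)), ...
                 min(max(round(state(2)), 1), obj.env.gridSize(2))];
        end
        function action_idx = choose_action(obj, state)
            % Epsilon-greedy selection over the main Q-table
            s = obj.discretize(state);
            if rand < obj.epsilon
                action_idx = randi(size(obj.actions, 1));
            else
                [~, action_idx] = max(obj.Q(s(1), s(2), :));
            end
        end
        function remember(obj, state, action_idx, reward, next_state)
            % Append the transition to the experience log
            obj.memory{end+1} = {state, action_idx, reward, next_state};
        end
        function update(obj, state, action_idx, reward, next_state)
            % One-step temporal-difference update bootstrapped from Q_target
            s  = obj.discretize(state);
            ns = obj.discretize(next_state);
            td_target = reward + obj.gamma * max(obj.Q_target(ns(1), ns(2), :));
            obj.Q(s(1), s(2), action_idx) = obj.Q(s(1), s(2), action_idx) + ...
                obj.alpha * (td_target - obj.Q(s(1), s(2), action_idx));
        end
        function update_target_network(obj)
            obj.Q_target = obj.Q;   % sync the target copy every few episodes
        end
        function log(obj, ep, total_reward)
            obj.reward_log(ep) = total_reward;
        end
    end
end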
VI. Key Metric Comparison
| Metric | Q-learning | Regression-enhanced Q-learning |
|---|---|---|
| Training time (s) | 120 | 150 (+25%) |
| Convergence speed | 800 episodes | 600 episodes (-25%) |
| Path smoothness | 0.18 | 0.12 (-33%) |
| Obstacle collision rate | 15% | 5% (-67%) |
This scheme combines the decision-making capability of Q-learning with the trajectory-prediction strength of the regression model to achieve efficient and robust path planning. In practice, the reward-function weights and the model structure need to be tuned to the specific scenario, and hardware acceleration may be required to meet real-time constraints.