# ============= 训练参数 =============
num_epochs_rl: int = 20 # RL训练总轮数
lr: float = 1e-3 # 初始学习率
batch_size: int = 12 # 批次大小
seed = 2025
num_epochs = 20
num_workers = 12
freeze_layers = ['vfe', 'map_to_bev', 'backbone_2d']
sync_bn = True
warm_up = 2 # epoch
# ============= PPO参数 =============
clip_param: float = 0.3 # PPO裁剪参数
ppo_epochs: int = 5 # 每次经验收集后的PPO更新轮数,不宜过大
gamma: float = 0.95 # 折扣因子
tau: float = 0.90 # GAE参数
value_coef: float = 0.7 # 值函数损失权重
entropy_coef: float = 0.05 # 熵正则化权重
max_grad_norm: float = 1.0 # 梯度裁剪阈值
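# --- Illustrative sketch (assumption): the code below reads these hyper-parameters as
# --- attributes of a single cfg object (cfg.lr, cfg.clip_param, ...). For quick local
# --- testing of PPOTrainer without the full mmengine Config, a SimpleNamespace such as
# --- the hypothetical one sketched here would be enough:
#   from types import SimpleNamespace
#   demo_cfg = SimpleNamespace(
#       num_epochs_rl=20, lr=1e-3, batch_size=12, seed=2025,
#       clip_param=0.3, ppo_epochs=5, gamma=0.95, tau=0.90,
#       value_coef=0.7, entropy_coef=0.05, max_grad_norm=1.0,
#   )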
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch.distributions import Normal
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import copy
import random
import numpy as np
from tqdm import tqdm
from collections import deque
from rl_seg.utils.compute_miou import get_miou, fast_hist, fast_hist_crop
from rl_seg.datasets import load_data_to_gpu
# 经验回放缓冲区
class ReplayBuffer:
def __init__(self, capacity=100):
self.buffer = deque(maxlen=capacity)
def add(self, experience):
"""添加经验到缓冲区"""
self.buffer.append(experience)
def sample(self, batch_size):
"""从缓冲区随机采样一批经验"""
return random.sample(self.buffer, min(batch_size, len(self.buffer)))
def clear(self):
"""清空缓冲区"""
self.buffer.clear()
def __len__(self):
return len(self.buffer)
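# Usage sketch for ReplayBuffer above (illustrative helper, not called during training):
# experiences are plain dicts; sample() never returns more items than are stored.
def _demo_replay_buffer():
    buf = ReplayBuffer(capacity=4)
    for step in range(6):
        buf.add({'step': step, 'reward': float(step)})
    assert len(buf) == 4                 # the two oldest entries were evicted
    batch = buf.sample(batch_size=8)     # clipped to len(buf) == 4
    buf.clear()
    return batch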
# PPO 代理(Actor-Critic 网络)
class PPOAgent(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim=512):
super(PPOAgent, self).__init__()
self.state_dim = state_dim
self.action_dim = action_dim
# 共享特征提取层
self.shared_layers = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
# nn.ReLU(),
nn.LayerNorm(hidden_dim),
nn.GELU(),
nn.Linear(hidden_dim, hidden_dim),
# nn.ReLU()
nn.LayerNorm(hidden_dim),
nn.GELU(),
)
# Actor 网络 (策略)
self.actor = nn.Sequential(
# nn.Linear(hidden_dim, hidden_dim),
# # nn.ReLU(),
# nn.GELU(),
nn.Linear(hidden_dim, action_dim),
nn.Tanh() # 输出在[-1,1]范围内
)
# Critic 网络 (值函数)
self.critic = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
# nn.ReLU(),
nn.GELU(),
nn.Linear(hidden_dim, 1)
)
# 动作标准差 (可学习参数)
self.log_std = nn.Parameter(torch.zeros(1, action_dim))
# 初始化权重
self.apply(self._init_weights)
def _init_weights(self, module):
"""初始化网络权重"""
if isinstance(module, nn.Linear):
nn.init.orthogonal_(module.weight, gain=0.01)  # 注意: 所有线性层统一使用0.01增益,初始策略/价值输出都会非常小
nn.init.constant_(module.bias, 0.0)
def forward(self, state):
features = self.shared_layers(state)
action_mean = self.actor(features)
value = self.critic(features)
return action_mean, value.squeeze(-1)
def act(self, state):
"""与环境交互时选择动作"""
# state = torch.FloatTensor(state).unsqueeze(0).to(device) # 确保是 [1, state_dim]
with torch.no_grad():
action_mean, value = self.forward(state)
# 创建动作分布 (添加最小标准差确保稳定性)
action_std = torch.clamp(self.log_std.exp(), min=0.01, max=0.5)
dist = Normal(action_mean, action_std)
# 采样动作
action = dist.sample() # [B, action_dim]
log_prob = dist.log_prob(action).sum(-1)
return action, log_prob, value
def evaluate(self, state, action):
"""评估动作的概率和值"""
action_mean, value = self.forward(state)
# 创建动作分布
action_std = torch.clamp(self.log_std.exp(), min=0.01, max=0.5)
dist = Normal(action_mean, action_std)
# 计算对数概率和熵
log_prob = dist.log_prob(action).sum(-1)
entropy = dist.entropy().sum(-1)
return log_prob, entropy, value
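# Usage sketch for PPOAgent above. The dimensions mirror rl_finetune() further below
# (state_dim = 64*2 + 21 + 21 + 2 + 23 = 195, action_dim = 5); the batch of 4 random
# states is purely hypothetical.
def _demo_ppo_agent():
    agent = PPOAgent(state_dim=195, action_dim=5)
    states = torch.randn(4, 195)
    actions, log_probs, values = agent.act(states)             # [4, 5], [4], [4]
    new_log_probs, entropy, new_values = agent.evaluate(states, actions)
    return actions.shape, entropy.shape, new_values.shape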
# 强化学习优化器
class PPOTrainer:
"""PPO训练器,整合了策略优化和模型微调"""
def __init__(self, seg_net, agent, cfg):
"""
Args:
seg_net: 预训练的分割网络
agent: PPO智能体
cfg: 配置对象,包含以下属性:
- lr: 学习率
- clip_param: PPO裁剪参数
- ppo_epochs: PPO更新轮数
- gamma: 折扣因子
- tau: GAE参数
- value_coef: 值函数损失权重
- entropy_coef: 熵正则化权重
- max_grad_norm: 梯度裁剪阈值
"""
self.seg_net = seg_net
self._base_seg_net = seg_net.module if isinstance(seg_net, DDP) else seg_net
self._base_seg_net.device = self.seg_net.device
self.agent = agent
self.cfg = cfg
self.writer = SummaryWriter(log_dir=f'{cfg.exp_dir}/runs/ppo_trainer') if cfg.local_rank == 0 else None
# 使用分离的优化器
self.optimizer_seg = optim.AdamW(
self.seg_net.parameters(),
lr=cfg.lr,
weight_decay=1e-4
)
self.optimizer_agent = optim.AdamW(
self.agent.parameters(),
lr=cfg.lr*0.1,
weight_decay=1e-4
)
# 训练记录
self.best_miou = 0.0
self.step_count = 0  # 已训练的batch计数
self.metrics = {
'loss': [],
'reward': [],
'miou': [],
'class_ious': [],
'lr': []
}
# 梯度缩放因子
self.neck_grad_scale = 1.0
self.head_grad_scale = 1.0
self.current_lr = cfg.lr # 当前学习率
self.class_weights = torch.ones(21).to(seg_net.device) # 初始类别权重
# 经验回放缓冲区
self.replay_buffer = ReplayBuffer(capacity=50)
def compute_state(self, features, pred, gt_seg, epoch_progress):
"""
计算强化学习状态向量
Args:
features: 从extract_features获取的字典包含:
- spatial_features: [B, C1, H, W]
- bev_features: [B, C2, H, W]
- neck_features: [B, C3, H, W]
pred: 网络预测的分割结果 [B, num_classes, H, W]
gt_seg: 真实分割标签 [B, H, W]
Returns:
state: 状态向量 [state_dim]
"""
# 主要使用neck_features作为代表特征 torch.Size([4, 64, 496, 432])
feats = features["neck_features"] # [B, C, H, W]
B, C, H, W = feats.shape
# 初始化状态列表
states = []
# 为批次中每个样本单独计算状态
for i in range(B):
# 特征统计
feat_mean = feats[i].mean(dim=(1, 2)) # [C]
feat_std = feats[i].std(dim=(1, 2)) # [C]
feat_max = feats[i].max(dim=1)[0].mean(dim=1)  # [C] (当前未加入状态向量)
feat_min = feats[i].min(dim=1)[0].mean(dim=1)  # [C] (当前未加入状态向量)
# 预测类别分布
pred_classes = pred[i].argmax(dim=0) # [H, W]
class_dist = torch.bincount(
pred_classes.flatten(),
minlength=21
).float() / (H * W) # 21
# 预测置信度统计
pred_probs = torch.softmax(pred[i], dim=0)
confidence = pred_probs.max(dim=0)[0] # 最大类别概率
conf_mean = confidence.mean()
conf_std = confidence.std()
conf_stats = torch.tensor([
confidence.mean(),
confidence.std(),
(confidence < 0.5).float().mean() # 低置信度像素比例
], device=feats.device) # 3
gt_grid_ind = gt_seg["grid_ind"][i]
gt_labels_ori = gt_seg["labels_ori"][i]
# 各类IoU (需实现单样本IoU计算)
sample_miou, sample_cls_iou = self.compute_sample_iou(pred[i], gt_grid_ind, gt_labels_ori, list(range(21)))
sample_cls_iou = torch.FloatTensor(sample_cls_iou).to(feats.device) # 21
# 添加额外状态信息
additional_state = torch.tensor([
self.current_lr / self.cfg.lr, # 归一化学习率
epoch_progress, # 训练进度
*self.class_weights.cpu().numpy() # 当前类别权重
], device=feats.device)
# 组合状态
state = torch.cat([
feat_mean,
feat_std,
class_dist,
sample_cls_iou,
conf_mean.unsqueeze(0),
conf_std.unsqueeze(0),
additional_state
])
states.append(state)
return torch.stack(states).to(feats.device)
def compute_sample_iou(self, pred, gt_grid_ind, gt_labels_ori, classes=list(range(21))):
"""计算单个样本的IoU"""
pred_labels = torch.argmax(pred, dim=0).cpu().detach().numpy()
gt_grid_idx = pred_labels[
gt_grid_ind[:, 1], gt_grid_ind[:, 0], gt_grid_ind[:, 2]
]
hist = fast_hist_crop(
gt_grid_idx, gt_labels_ori, classes
)
iou = np.diag(hist) / ((hist.sum(1) + hist.sum(0) - np.diag(hist)) + 1e-8)
miou = np.nanmean(iou)
return miou, iou
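# Worked example of the IoU formula above (illustration): for a 2-class confusion
# matrix hist = [[8, 2], [1, 9]], intersection = diag = [8, 9] and
# union = row_sum + col_sum - diag = [10+9-8, 10+11-9] = [11, 12],
# so iou ≈ [0.727, 0.750] and miou ≈ 0.739. fast_hist_crop is assumed to return such a
# square (num_classes x num_classes) matrix with GT on one axis and predictions on the other.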
def compute_reward(self, miou, prev_miou, class_ious, prev_class_ious):
"""
计算复合奖励函数
Args:
miou: 当前mIoU
prev_miou: 前一次mIoU
class_ious: 当前各类IoU [num_classes]
prev_class_ious: 前一次各类IoU [num_classes]
Returns:
reward: 综合奖励值
"""
# 基础奖励: mIoU提升
# miou_reward = 5.0 * (miou - prev_miou) * (1 + miou) # 高性能时奖励更大
miou_reward = 15.0 * np.sign(miou - prev_miou) * np.exp(3 * abs(miou - prev_miou)) # 1. 基础mIoU奖励(指数放大改进)
# 类别平衡奖励: 鼓励所有类别均衡提升
class_reward = 0.0
for cls, (iou, prev_iou) in enumerate(zip(class_ious, prev_class_ious)):
if iou > prev_iou:
# 对稀有类别给予更高奖励
weight = 1.0 + (1.0 - prev_iou) # 性能越差的类权重越高
improvement = max(0, iou - prev_iou)
class_reward += weight * improvement
# 惩罚项: 防止某些类别性能严重下降
penalty = 0.0
for cls, (iou, prev_iou) in enumerate(zip(class_ious, prev_class_ious)):
if iou < prev_iou * 0.7: # 性能下降超过30%
penalty += 3.0 * (prev_iou - iou) * (1 - prev_iou)
# 4. 探索奖励
entropy_bonus = 0.2 * self.agent.log_std.mean().exp().item()
# 平衡奖励 (鼓励所有类别均衡提升)
balance_reward = 0.5 * (1.0 - float(np.std(class_ious)))  # 用numpy计算,避免torch张量混入np.clip
total_reward = miou_reward + class_reward - penalty + entropy_bonus + balance_reward
return np.clip(total_reward, -2.0, 5.0) # 限制奖励范围
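# Note on the clipping above (observation, not a behavioural change): the first term alone
# satisfies |miou_reward| = 15*exp(3*|Δ|) >= 15 for any nonzero Δ, which already lies outside
# the clip range [-2, 5]. np.clip therefore collapses the total reward to roughly +5 whenever
# mIoU improves and -2 whenever it drops, regardless of how large the change is, and the
# class/penalty/entropy/balance terms barely influence the result. If a graded reward signal
# is wanted, the 15x scale (or the clip bounds) would need to be revisited.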
def compute_advantages(self, rewards, values):
"""计算GAE优势"""
if isinstance(rewards, list):
rewards = torch.tensor(rewards).to(values.device)
advantages = torch.zeros_like(rewards)
last_advantage = 0
# 反向计算GAE
for t in reversed(range(len(rewards))):
if t == len(rewards) - 1:
next_value = 0
else:
next_value = values[t+1]
delta = rewards[t] + self.cfg.gamma * next_value - values[t]
advantages[t] = delta + self.cfg.gamma * self.cfg.tau * last_advantage
last_advantage = advantages[t]
# 标准化优势
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
return advantages
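# Worked example of the GAE recursion above (gamma=0.95, tau=0.90 from the config):
# with rewards = [1.0, 1.0] and values = [0.5, 0.4],
#   t=1: delta = 1.0 + 0.95*0   - 0.4 = 0.60, A_1 = 0.60
#   t=0: delta = 1.0 + 0.95*0.4 - 0.5 = 0.88, A_0 = 0.88 + 0.95*0.90*0.60 ≈ 1.39
# before the final normalisation step.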
def apply_action(self, action):
"""
应用智能体动作调整模型参数
Args:
action: [action_dim=5] 连续动作向量,范围[-1, 1]
"""
action = action.squeeze(0)
# 动作0: 调整学习率
lr_scale = 0.8 + 0.4 * (action[0] + 1) / 2 # 映射到[0.8, 1.2]
new_lr = self.cfg.lr * lr_scale
# 更新学习率
if abs(new_lr - self.current_lr) > 1e-6:
for param_group in self.optimizer_seg.param_groups:
param_group['lr'] = new_lr
self.current_lr = new_lr
# # 动作2-3: 调整特征提取层权重 (范围[0.9, 1.1])
# neck_scale = 0.9 + 0.1 * (action[2] + 1) / 2
# with torch.no_grad():
# for param in self.seg_net.module.at_seg_neck.parameters():
# param.data *= neck_scale # (0.9 + 0.1 * action[2]) # 调整范围[0.9,1.1]at_seg_neck
# # 动作4-5: 调整分类头权重
# head_scale = 0.8 + 0.2 * (action[4] + 1) / 2
# with torch.no_grad():
# for param in self.seg_net.module.at_seg_head.parameters():
# param.data *= head_scale # (0.9 + 0.1 * action[4]) # 调整范围[0.9,1.1]
# 动作1: 设置特征提取层梯度缩放因子 (范围[0.5, 1.5])
self.neck_grad_scale = 0.5 + 0.5 * (action[1] + 1) # [-1,1] -> [0.5, 1.5]
# 动作2: 设置分类头梯度缩放因子 (范围[0.5, 1.5])
self.head_grad_scale = 0.5 + 0.5 * (action[2] + 1) # [-1,1] -> [0.5, 1.5]
# 4. 损失函数权重调整
# if hasattr(self.seg_net.module.at_seg_head, 'loss_weights'):
# new_weights = F.softmax(torch.tensor([
# 1.0 + action[4],
# 1.0 + action[5]
# ]), dim=0)
# self.seg_net.module.at_seg_head.loss_weights = new_weights
# 动作3: 调整选定类别的权重缩放 (范围[0.8, 1.2])
cls_weight_scale = 0.8 + 0.4 * (action[3] + 1.0) / 2.0
# 动作4: 选择要调整的类别 (范围[0, 20])
cls_idx = int((action[4] + 1.0) * 10) # 映射到[0,20]
cls_idx = max(0, min(20, cls_idx))
# 更新类别权重
self.class_weights[cls_idx] = torch.clamp(
self.class_weights[cls_idx] * cls_weight_scale,
min=0.5, max=2.0
)
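# Worked example of the action mapping above for a hypothetical action
# a = [0.0, 0.0, 1.0, -1.0, 0.5]:
#   lr_scale         = 0.8 + 0.4*(0.0+1)/2  = 1.0  -> new_lr = cfg.lr
#   neck_grad_scale  = 0.5 + 0.5*(0.0+1)    = 1.0
#   head_grad_scale  = 0.5 + 0.5*(1.0+1)    = 1.5
#   cls_weight_scale = 0.8 + 0.4*(-1.0+1)/2 = 0.8
#   cls_idx          = int((0.5+1)*10)      = 15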
def train_epoch(self, train_loader, epoch):
"""执行一个训练周期"""
epoch_metrics = {
'seg_loss': 0.0,
'reward': 0.0,
'miou': 0.0,
'class_ious': np.zeros(21),
'policy_loss': 0.0,
'value_loss': 0.0,
'entropy_loss': 0.0,
'batch_count': 0
}
self.seg_net.train()
self.agent.train()
adjusted_hist_all = []
# 自适应探索参数
current_std = max(0.1, 0.5 * (1 - epoch/self.cfg.num_epochs_rl))
total_batches = len(train_loader)
for batch_idx, data_dicts in enumerate(tqdm(train_loader, desc=f"RL Epoch {epoch+1}/{self.cfg.num_epochs_rl}")):
load_data_to_gpu(data_dicts)
# 计算当前进度
epoch_progress = batch_idx / total_batches
# 1. 保存原始网络参数 (注意: 步骤6的参数恢复已被注释,此deepcopy目前未被使用,且每个batch都有较大开销)
original_state = copy.deepcopy(self.seg_net.state_dict())
# 2. 初始预测和特征 首先用分割网络计算初始预测(不计算梯度)
with torch.no_grad():
initial_pred = self.seg_net(data_dicts)
initial_miou_batch, initial_class_ious_batch, _ = get_miou(
initial_pred,
data_dicts,
classes=list(range(21))
)
features = self.seg_net.module.extract_features(data_dicts) # DDP包装了
# features = self._base_seg_net.extract_features(data_dicts)
# 3. 计算初始状态并选择动作
states = self.compute_state(features, initial_pred, data_dicts, epoch_progress) # [B, state_dim]
# print(states.shape)
# 设置自适应探索
with torch.no_grad():
self.agent.log_std.data = torch.clamp(self.agent.log_std.data, max=float(np.log(current_std)))  # log_std需与log(std)比较,而非直接与std比较
# 为每个状态选择动作(循环中调用`agent.act`)
actions, log_probs, values = self.agent.act(states) # torch.Size([4, 6]) torch.Size([4]) torch.Size([4])
# 分布式训练中同步动作
if self.cfg.distributed:
mean_action = actions.mean(dim=0)
mean_action = mean_action.to(self.cfg.local_rank)
dist.all_reduce(mean_action, op=dist.ReduceOp.SUM)
mean_action /= dist.get_world_size()
else:
mean_action = actions.mean(dim=0)
# 4. 应用动作(调整网络参数和优化器学习率)
self.apply_action(mean_action)
# 5. 调整后预测 (保留梯度,供后续监督损失反向传播)
adjusted_pred = self.seg_net(data_dicts)
adjusted_miou_batch, adjusted_class_ious_batch, adjusted_hist_batch = get_miou(
adjusted_pred,
data_dicts,
classes=list(range(21))
)
adjusted_hist_all += adjusted_hist_batch
# # === 步骤6: 恢复原始参数 ===
# self.seg_net.load_state_dict(original_state)
# 7. 计算奖励 (使用整个批次的平均改进)
reward = self.compute_reward(
adjusted_miou_batch,
initial_miou_batch,
adjusted_class_ious_batch,
initial_class_ious_batch
)
# === 步骤9: PPO优化 ===
# 存储经验
experience = {
'states': states,
'actions': actions,
'rewards': [reward] * len(actions),
'old_log_probs': log_probs,
'old_values': values,
# 'advantages': advantages,
}
self.replay_buffer.add(experience)
# PPO优化
policy_loss = value_loss = entropy_loss = 0.0 # 缓冲区样本不足时记为0,避免后续日志引用未定义变量
if len(self.replay_buffer) >= 10: # 缓冲区有足够样本
policy_loss, value_loss, entropy_loss = self.ppo_update(experience)
epoch_metrics['policy_loss'] += policy_loss
epoch_metrics['value_loss'] += value_loss
epoch_metrics['entropy_loss'] += entropy_loss
# === 步骤8: 正常监督学习 ===
# 前向传播
# pred = self.seg_net(data_dicts)
seg_loss = self.seg_net.module.at_seg_head.get_loss(
adjusted_pred,
data_dicts["gt_seg"].to(adjusted_pred.device),
class_weights=self.class_weights
)
# 反向传播
self.optimizer_seg.zero_grad()
seg_loss.backward()
# 应用梯度缩放
for name, param in self.seg_net.named_parameters():
if 'at_seg_neck' in name and param.grad is not None:
param.grad *= self.neck_grad_scale
elif 'at_seg_head' in name and param.grad is not None:
param.grad *= self.head_grad_scale
# 梯度裁剪和更新
torch.nn.utils.clip_grad_norm_(
self.seg_net.parameters(),
self.cfg.max_grad_norm
)
self.optimizer_seg.step()
# === 步骤10: 记录指标 ===
epoch_metrics['seg_loss'] += seg_loss.item()
epoch_metrics['reward'] += reward
epoch_metrics['miou'] += adjusted_miou_batch
epoch_metrics['class_ious'] += adjusted_class_ious_batch
# epoch_metrics['policy_loss'] += policy_loss
# epoch_metrics['value_loss'] += value_loss
# epoch_metrics['entropy_loss'] += entropy_loss
epoch_metrics['batch_count'] += 1
self.step_count += 1
# 记录到TensorBoard
if self.step_count % 10 == 0:
if self.writer:
self.writer.add_scalar('Loss/seg_loss', seg_loss.item(), self.step_count)
self.writer.add_scalar('Reward/total', reward, self.step_count)
self.writer.add_scalar('mIoU/train', adjusted_miou_batch, self.step_count)
self.writer.add_scalar('Loss/policy', policy_loss, self.step_count)
self.writer.add_scalar('Loss/value', value_loss, self.step_count)
self.writer.add_scalar('Loss/entropy', entropy_loss, self.step_count)
self.writer.add_scalar('Params/lr_scale', self.optimizer_seg.param_groups[0]['lr'] / self.cfg.lr, self.step_count)
self.writer.add_scalar('Params/neck_grad_scale', self.neck_grad_scale, self.step_count)
self.writer.add_scalar('Params/head_grad_scale', self.head_grad_scale, self.step_count)
self.writer.add_scalar('Params/exploration_std', current_std, self.step_count)
# 计算平均指标
avg_metrics = {}
for k in epoch_metrics:
if k != 'batch_count':
avg_metrics[k] = epoch_metrics[k] / epoch_metrics['batch_count']
hist = sum(adjusted_hist_all) #(21, 21)
all_iou_overall = np.diag(hist) / ((hist.sum(1) + hist.sum(0) - np.diag(hist)) + 1e-8) # (21,)
miou_epoch = np.nanmean(all_iou_overall)
# # 记录到TensorBoard
# self.writer.add_scalar('Loss/seg_loss', avg_metrics['seg_loss'], epoch)
# self.writer.add_scalar('Reward/total', avg_metrics['reward'], epoch)
# self.writer.add_scalar('mIoU/train', avg_metrics['miou'], epoch)
# self.writer.add_scalar('Loss/policy', avg_metrics['policy_loss'], epoch)
# self.writer.add_scalar('Loss/value', avg_metrics['value_loss'], epoch)
# self.writer.add_scalar('Loss/entropy', avg_metrics['entropy_loss'], epoch)
return avg_metrics
def ppo_update(self, experience):
"""
PPO策略优化步骤
说明: 参数experience当前未直接使用;方法内部从replay_buffer采样,
每条经验为包含 states/actions/old_log_probs/old_values/rewards 的字典。
Returns:
policy_loss: 策略损失值
value_loss: 值函数损失值
entropy_loss: 熵损失值
"""
# 从缓冲区采样经验
experiences = self.replay_buffer.sample(batch_size=8)
policy_losses, value_losses, entropy_losses = [], [], []
for exp in experiences:
states = exp['states']
actions = exp['actions']
old_log_probs = exp['old_log_probs']
old_values = exp['old_values']
rewards = exp['rewards']
# 计算GAE优势
advantages = self.compute_advantages(rewards, old_values)
returns = advantages + old_values # 注意: advantages已标准化,据此得到的returns并非真实折扣回报,value目标会有偏差
for _ in range(self.cfg.ppo_epochs):
# 评估当前策略
log_probs, entropy, values = self.agent.evaluate(states, actions)
# 比率
ratios = torch.exp(log_probs - old_log_probs.detach())
# 裁剪目标
surr1 = ratios * advantages.detach()
surr2 = torch.clamp(ratios,
1.0 - self.cfg.clip_param,
1.0 + self.cfg.clip_param) * advantages.detach()
# 策略损失
policy_loss = -torch.min(surr1, surr2).mean()
# 值函数损失
value_loss = 0.5 * (returns.detach() - values).pow(2).mean()
# 熵损失
entropy_loss = -entropy.mean()
# 总损失
loss = policy_loss + self.cfg.value_coef * value_loss + self.cfg.entropy_coef * entropy_loss
# 智能体参数更新
self.optimizer_agent.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(
self.agent.parameters(),
self.cfg.max_grad_norm
)
self.optimizer_agent.step()
policy_losses.append(policy_loss.item())
value_losses.append(value_loss.item())
entropy_losses.append(entropy_loss.item())
# 清空缓冲区
self.replay_buffer.clear()
return (
np.mean(policy_losses) if policy_losses else 0.0,
np.mean(value_losses) if value_losses else 0.0,
np.mean(entropy_losses) if entropy_losses else 0.0,
)
def close(self):
"""关闭资源"""
if self.writer:
self.writer.close()
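# Illustrative sketch of the clipped PPO surrogate computed inside PPOTrainer.ppo_update
# (toy tensors; clip_param = 0.3 as in the config above). Not used by the trainer itself.
def _demo_clipped_surrogate():
    log_probs = torch.tensor([-1.0, -0.5])        # new policy log-probs
    old_log_probs = torch.tensor([-1.2, -0.1])    # behaviour policy log-probs
    advantages = torch.tensor([1.0, -1.0])
    ratios = torch.exp(log_probs - old_log_probs)                     # ~[1.22, 0.67]
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - 0.3, 1.0 + 0.3) * advantages    # clipped ratio
    return -torch.min(surr1, surr2).mean()                            # policy loss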
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os
import argparse
from collections import deque
from torch.utils.data import Dataset, DataLoader
from torch.distributions import Normal, Categorical
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from mmengine.registry import MODELS, DATASETS
from mmengine.config import Config
from rl_seg.datasets.build_dataloader import init_dist_pytorch, build_dataloader
from rl_seg.datasets import load_data_to_gpu
from rl_seg.agents.ppo_agent import PPOAgent, PPOTrainer
from rl_seg.utils.compute_miou import get_miou, fast_hist, fast_hist_crop
from rl_seg.utils.logger import logger, get_root_logger
# 监督学习预训练
def supervised_pretrain(cfg):
seg_net = MODELS.build(cfg.model).to('cuda')
seg_head = MODELS.build(cfg.model.at_seg_head).to('cuda')
if cfg.pretrained_path:
ckpt = torch.load(cfg.pretrained_path)
seg_net.load_state_dict(ckpt['state_dict'], strict=True)
logger.info(f'Load pretrained ckpt: {cfg.pretrained_path}')
freeze_pre_backbone_layers(seg_net, freeze_layers = ['vfe', 'map_to_bev', 'backbone_2d'])
if cfg.sync_bn:
seg_net = nn.SyncBatchNorm.convert_sync_batchnorm(seg_net)
n_parameters = sum(p.numel() for p in seg_net.parameters() if p.requires_grad)
# logger.info(f"Model: \n{self.model}")
logger.info(f"Num params: {n_parameters}")
seg_net = DDP(seg_net, device_ids=[cfg.local_rank])
if cfg.local_rank == 0:
logger.info(seg_net)
optimizer = optim.Adam(seg_net.parameters(), lr=cfg.lr)
writer = SummaryWriter(log_dir=f'{cfg.exp_dir}/runs/pretrain') if cfg.local_rank == 0 else None
train_losses = []
train_mious = []
train_class_ious = [] # 存储每个epoch的各类IoU
best_miou = 0
for epoch in range(cfg.num_epochs):
cfg.sampler.set_epoch(epoch)
epoch_loss = 0.0
epoch_miou = 0.0
epoch_class_ious = np.zeros(21) # 初始化各类IoU累加器
seg_net.train()
all_miou = []
all_hist = []
batch_count = 0
for data_dicts in tqdm(cfg.train_loader, desc=f"Pretrain Epoch {epoch+1}/{cfg.num_epochs}"):
optimizer.zero_grad()
pred = seg_net(data_dicts)
device = pred.device
seg_head = seg_head.to(device)
loss = seg_head.get_loss(pred, data_dicts["gt_seg"].to(device))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
# import pdb;pdb.set_trace()
# 计算mIoU
class_ious = []
batch_miou, cls_iou, hist_batch = get_miou(pred, data_dicts, classes=list(range(21)))
all_miou.append(batch_miou)
all_hist += hist_batch
batch_count += 1
if batch_count % 100 == 0 and cfg.local_rank == 0:
logger.debug(f"Epoch {epoch+1}/{cfg.num_epochs}, Batch {batch_count}, \
Loss: {loss.item():.4f}, miou: {batch_miou}")
# 计算epoch平均指标
avg_loss = epoch_loss / batch_count if batch_count > 0 else 0.0
hist = sum(all_hist)
class_ious = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
miou = np.nanmean(class_ious)
train_losses.append(avg_loss)
train_mious.append(miou)
train_class_ious.append(class_ious) # 存储各类IoU
# 记录到TensorBoard
if writer:
writer.add_scalar('Loss/train', avg_loss, epoch)
writer.add_scalar('mIoU/train', miou, epoch)
for cls, iou in enumerate(class_ious):
writer.add_scalar(f'IoU/{cfg.class_names[cls]}', iou, epoch)
if cfg.local_rank == 0:
logger.info(f"Epoch {epoch+1}/{cfg.num_epochs} - Loss: {avg_loss:.3f}, mIoU: {miou*100:.3f}")
logger.info("Class IoUs:")
for cls, iou in enumerate(class_ious):
logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")
if best_miou < miou:
best_miou = miou
torch.save({'state_dict': seg_net.module.state_dict()},
f"{cfg.exp_dir}/seg_pretrained_best_{best_miou*100:.3f}.pth")
# 同步所有进程
dist.barrier()
# # 保存预训练模型
if cfg.local_rank == 0:
torch.save({'state_dict': seg_net.module.state_dict()}, f"{cfg.exp_dir}/seg_pretrained_latest.pth")
if writer:
writer.close()
return seg_net
# 强化学习微调
def rl_finetune(model, cfg):
state_dim = 64*2 + 21 + 21 + 2 + 23  # feat_mean+feat_std(128) + 类别分布(21) + 各类IoU(21) + 置信度均值/方差(2) + 额外状态(lr比例+进度+21类权重=23)
action_dim = 5
freeze_pre_backbone_layers(model, freeze_layers = ['vfe', 'map_to_bev', 'backbone_2d'])
# 初始化PPO智能体 (与分割网络放在同一设备上)
device = next(model.parameters()).device
agent = PPOAgent(state_dim, action_dim).to(device)
if cfg.agent_path:
ckpt = torch.load(cfg.agent_path)
agent.load_state_dict(ckpt['state_dict'])
logger.info(f'Load agent ckpt: {cfg.agent_path}')
trainer = PPOTrainer(model, agent, cfg)
train_losses = []
train_rewards = []
train_mious = []
# 训练循环
for epoch in range(cfg.num_epochs_rl):
avg_metrics = trainer.train_epoch(cfg.train_loader, epoch)
# 记录指标
train_losses.append(avg_metrics['seg_loss'])
train_rewards.append(avg_metrics['reward'])
train_mious.append(avg_metrics['miou'])
# 保存最佳模型
if avg_metrics['miou'] > trainer.best_miou:
trainer.best_miou = avg_metrics['miou']
torch.save({'state_dict': model.module.state_dict()}, f"{cfg.exp_dir}/seg_rl_best_{trainer.best_miou*100:.3f}.pth")
torch.save({'state_dict': agent.state_dict()}, f"{cfg.exp_dir}/ppo_agent_best.pth")
# 打印日志
if cfg.local_rank == 0:
logger.info(f"\nRL Epoch {epoch+1}/{cfg.num_epochs_rl} Results:")
logger.info(f" Seg Loss: {avg_metrics['seg_loss']:.4f}")
logger.info(f" Reward: {avg_metrics['reward']:.4f}")
logger.info(f" mIoU: {avg_metrics['miou']*100:.3f} (Best: {trainer.best_miou*100:.3f})")
logger.info(f" Policy Loss: {avg_metrics['policy_loss']:.4f}")
logger.info(f" Value Loss: {avg_metrics['value_loss']:.4f}")
logger.info(f" Entropy Loss: {avg_metrics['entropy_loss']:.4f}")
logger.info(f" Learning Rate: {trainer.optimizer_seg.param_groups[0]['lr']:.2e}")
logger.info(" Class IoUs:")
for cls, iou in enumerate(avg_metrics['class_ious']):
logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")
# 保存最终模型和训练记录
if cfg.local_rank == 0:
torch.save({'state_dict': model.module.state_dict()}, f"{cfg.exp_dir}/seg_rl_final.pth")
torch.save({'state_dict': agent.state_dict()}, f"{cfg.exp_dir}/ppo_agent_final.pth")
logger.info(f"\nTraining completed. Best mIoU: {trainer.best_miou:.4f}")
trainer.close()
return model, agent
# 模型评估
def evaluate_model(model, cfg):
model.eval()
avg_miou = 0
class_ious = np.zeros(21)
hist_list = []
all_miou = []
with torch.no_grad():
for data_dicts in tqdm(cfg.val_loader, desc="Evaluating"):
load_data_to_gpu(data_dicts)
pred = model(data_dicts)
batch_miou, cls_iou, hist_batch = get_miou(pred, data_dicts, classes=list(range(21)))
class_ious += cls_iou
hist_list += hist_batch
all_miou.append(batch_miou)
hist = sum(hist_list)
# IoU = TP / (TP + FN + FP) = diag(hist) / (行和 + 列和 - diag(hist))
class_ious = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
miou = np.nanmean(class_ious)
if cfg.local_rank == 0:
logger.info("\nEvaluation Results:")
logger.info(f"Overall mIoU: {miou*100:.3f}")
for cls, iou in enumerate(class_ious):
logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")
return miou, class_ious
# 主函数
def main(args):
cfg = Config.fromfile(args.cfg_file)
os.environ['CUBLAS_WORKSPACE_CONFIG']=':16:8'
if int(os.environ['GPU_NUM']) > 1:
args.MASTER_ADDR = os.environ["MASTER_ADDR"]
args.MASTER_PORT = os.environ["MASTER_PORT"]
# 自动获取 torchrun 传入的全局变量
cfg.rank = int(os.environ["RANK"])
cfg.local_rank = int(os.environ["LOCAL_RANK"])
cfg.world_size = int(os.environ["WORLD_SIZE"])
else:
cfg.rank = 0
cfg.local_rank = 0
cfg.world_size = 1
os.environ['MASTER_ADDR'] = 'localhost'
os.environ["MASTER_PORT"] = '23456'
total_gpus, LOCAL_RANK = init_dist_pytorch(cfg)
cfg.distributed = True
# 第一阶段:监督学习预训练
logger.info("="*50)
logger.info("Starting Supervised Pretraining...")
logger.info("="*50)
if args.batch_size:
cfg.batch_size = args.batch_size
cfg.work_dir = os.environ["userPath"]
cfg.jinn_dir = os.environ["JinnTrainResult"]
if args.exp:
cfg.exp_dir = os.path.join(cfg.jinn_dir, args.exp)
else:
cfg.exp_dir = cfg.jinn_dir
if cfg.local_rank == 0 and not os.path.exists(cfg.exp_dir):
os.makedirs(cfg.exp_dir)
logger.info(f'exp dir: {cfg.exp_dir}')
cfg.num_gpus = cfg.world_size
logger.info(f'configs: \n{cfg.pretty_text}')
dist_train = True
train_dataset, train_dataloader, sampler = build_dataloader(dataset_cfg=cfg,
data_path=cfg.train_data_path,
workers=cfg.num_workers,
samples_per_gpu=cfg.batch_size,
num_gpus=total_gpus,
dist=dist_train,
pipeline=cfg.train_pipeline,
training=True)
cfg.train_loader = train_dataloader
cfg.sampler = sampler
seg_net = supervised_pretrain(cfg)
val_dataset, val_dataloader, sampler = build_dataloader(dataset_cfg=cfg,
data_path=cfg.val_data_path,
workers=cfg.num_workers,
samples_per_gpu=cfg.batch_size,
num_gpus=total_gpus,
dist=True,
pipeline=cfg.val_pipeline,
training=False)
cfg.val_loader = val_dataloader
cfg.sampler = sampler
# 评估预训练模型
logger.info("\nEvaluating Pretrained Model...")
pretrain_miou, pretrain_class_ious = evaluate_model(seg_net, cfg)
# return
# 第二阶段:强化学习微调
logger.info("\n" + "="*50)
logger.info("Starting RL Finetuning...")
logger.info("="*50)
rl_seg_net, ppo_agent = rl_finetune(seg_net, cfg)
# 评估强化学习优化后的模型
logger.info("\nEvaluating RL Optimized Model...")
rl_miou, rl_class_ious = evaluate_model(rl_seg_net, cfg)
# 结果对比
if cfg.local_rank == 0:
logger.info("\nPerformance Comparison:")
logger.info(f"Pretrained mIoU: {pretrain_miou*100:.3f}")
logger.info(f"RL Optimized mIoU: {rl_miou*100:.3f}")
logger.info(f"Improvement: {(rl_miou - pretrain_miou)*100:.3f} ({((rl_miou - pretrain_miou)/pretrain_miou+1e-8)*100:.2f}%)")
if pretrain_miou > rl_miou:
torch.save({'state_dict': seg_net.module.state_dict()}, f"{cfg.exp_dir}/best_model_{pretrain_miou*100:.3f}.pth")
else:
torch.save({'state_dict': rl_seg_net.module.state_dict()}, f"{cfg.exp_dir}/best_model_{rl_miou*100:.3f}.pth")
logger.info("\nTraining completed successfully!")
def freeze_pre_backbone_layers(model, freeze_layers=['vfe', 'map_to_bev', 'backbone_2d']):
"""冻结主干网络前的所有层"""
# 常见预主干层名称(根据实际模型结构调整)
if hasattr(model, 'module'): # 处理 DDP 封装
model = model.module
for name, module in model.named_children():
# 冻结所有指定名称的模块
if name in freeze_layers:
logger.info(f"Freezing layer: {name}")
for param in module.parameters():
param.requires_grad = False
# 额外冻结非主干/分割头的模块
elif name not in ['at_seg_neck', 'at_seg_head']:
logger.info(f"Freezing non-core layer: {name}")
for param in module.parameters():
param.requires_grad = False
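# Usage sketch for freeze_pre_backbone_layers (hypothetical helper, mirroring the
# n_parameters count in supervised_pretrain): after freezing, only at_seg_neck /
# at_seg_head parameters should remain trainable.
def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
# Example:
#   freeze_pre_backbone_layers(seg_net)
#   logger.info(f"Trainable params after freezing: {count_trainable_params(seg_net)}")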
if __name__ == "__main__":
def args_config():
parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--cfg_file', type=str, default="rl_seg/configs/rl_seg_leap.py",
help='specify the config for training')
parser.add_argument('--batch_size', type=int, default=None, required=False, help='batch size for training')
parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
parser.add_argument('--pretrained_model', type=str, default=None, help='pretrained_model')
parser.add_argument('--exp', type=str, default=None, help='export dir.')
return parser.parse_args()
args = args_config()
main(args)
Analysis request: please analyze the code and the overall scheme. During the RL stage the losses barely move (Seg Loss: 0.0587, Reward: 0.2000, Policy Loss: -0.0000, Value Loss: 0.4583, Entropy Loss: -3.6290, Learning Rate: 9.95e-04) and mIoU shows no clear improvement; the final result is Pretrained mIoU: 75.171 vs RL Optimized mIoU: 74.355, i.e. Improvement: -0.816 (-1.09%). Check whether the configuration parameters and the code implementation are logically sound, then propose an optimization plan.