yolov5训练部分

原创

已于 2024-10-29 09:29:45 修改 · 348 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#YOLO #深度学习 #机器学习

于 2024-09-24 10:13:26 首次发布

简易版

# 开始训练轮次
    for epoch in range(start_epoch,epochs):
        # 回调函数
        callbacks.run('on_train_epoch_start')
        # 设置模型为训练模式，这样模型会启用dropout和batchnormlization等训练特有的层
        model.train()
        # 初始化损失
        mloss=torch.zeros(3,device=device)

        # 清0优化器梯度
        optimizer.zero_grad()
        pbar=enumerate(train_loader)
        # 遍历每个批次
        for i,(imgs,targets,paths,_) in pbar:
            callbacks.run('on_train_batch_start')
            # 图像处理
            imgs=imgs.to(device,non_blocking=True).float()/255


        # forward
        # 上下文管理器 启用自动混合精度
        with torch.cuda.amp.autocast(amp):
            # 模型前向传播
            pred=model(imgs)
            # 计算损失
            loss,loss_items=compute_loss(pred,targets.to(device))

        # backward
        # 使用scaler 对损失值进行缩放
        scaler.scale(loss).backward()
        
        # optimizer
        # ni：当前的训练步骤 上次优化的步骤  设置的累计步数
        # 是否达到了梯度累计的标准
        if ni-last_opt_step>=accumulate:
            # 梯度从缩放状态转化为原始状态
            scaler.unscale_(optimizer)
            # 进行梯度裁剪，防止梯度爆炸 max_norm=10.0：裁剪的阈值
            torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=10.0)
            # 优化步骤
            scaler.step(optimizer)
            # 更新
            scaler.update()
            # 梯度清0，以准备下一次前向和反向传播
            optimizer.zero_grad()

日志记录表头信息

# Log the column headings of the training progress table: a leading newline,
# then one 11-character-wide field per column name.
header_columns = ('Epoch', 'GPU_mem', 'box_loss', 'obj_loss', 'cls_loss', 'Instances', 'Size')
header_format = '\n' + '%11s' * len(header_columns)
LOGGER.info(header_format % header_columns)

`('%11s' * 7)` 创建了一个格式字符串：`%11s` 表示每个字段右对齐并占 11 个字符的宽度，乘以 7 即把该占位符重复 7 次，与表头的 7 个列名一一对应。

分布式训练准备

# Distributed-training settings read from the environment.
# Variable names follow https://pytorch.org/docs/stable/elastic/run.html
LOCAL_RANK = int(os.environ.get('LOCAL_RANK', -1))  # -1 when the variable is unset
RANK = int(os.environ.get('RANK', -1))  # -1 when the variable is unset
WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1))  # 1 when the variable is unset
GIT_INFO = check_git_info()  # value returned by the check_git_info() helper (defined elsewhere)

断点训练Resume

def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze
    callbacks.run('on_pretrain_routine_start')

    # Directories
    w = save_dir / 'weights'  # weights dir
    # 如果目录已存在则不会报错（exist_ok=True），mkdir 创建目录
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{
     
     k}={
     
     v}' for k, v in hyp.items()))
    # 将超参数的副本存储在 opt.hyp 中，以便在检查点中保存
    opt.hyp = hyp.copy()  # for saving hyps to checkpoints

    # Save run settings
    if not evolve:
        yaml_save(save_dir / 'hyp.yaml', hyp)
        yaml_save(save_dir / 'opt.yaml', vars(opt))

    # Loggers
    data_dict = None
    # 如果当前进程是主进程（RANK 为 -1 或 0），则实例化日志记录器 Loggers，并传入保存目录、权重、选项、超参数和日志记录器
    if RANK in {
   
   -1, 0}:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance

        # Register actions
        # 遍历 loggers 中的方法，并将这些方法注册为回调动作，以便在训练过程中能够调用它们
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

        # Process custom dataset artifact link
        # 从 loggers 中获取远程数据集的信息。
        data_dict = loggers.remote_dataset
        if resume:  # If resuming runs from remote artifact
            weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

    # Config
    # plots 变量用于决定是否生成训练过程中的可视化图表。只有在不进行超参数进化且没有禁用绘图时，plots 才会为 True。
    plots = not evolve and not opt.noplots  # create plots
    cuda = device.type != 'cpu'
    # 初始化随机种子，以确保训练的可重复性。种子是根据传入的种子加上 RANK（进程标识符）生成的，deterministic=True 确保所有操作都是确定性的。
    init_seeds(opt.seed + 1 + RANK, deterministic=True)
    # 使用 torch_distributed_zero_first 上下文管理器确保在分布式训练中，只有主进程（LOCAL_RANK 为 0）会调用 check_dataset 函数检查数据集。如果 data_dict 是 None，则调用 check_dataset 来验证数据集的有效性。
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    # 从 data_dict 中提取训练和验证数据集的路径
    train_path, val_path = data_dict['train'], data_dict['val']
    # 根据 single_cls 的值设置类别数量。如果是单类别（single_cls 为 True），则类别数量为 1；否则从 data_dict 中获取
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    # 如果是单类别并且 data_dict 中的名称数量不是 1，则将名称设置为 {0: 'item'}；否则使用 data_dict 中的名称列表。
    names = {
   
   0: 'item'} if single_cls and len(data_dict['names'