Training the RTMDet Object Detector on a Custom Dataset with mmdetection-3.1.0

This article walks through training an object detection model: setting up the environment (per the official docs), building a COCO-format dataset from labelImg annotations (preparing the custom dataset and converting it), configuring the training config file and parameters with the rtmdet-l model as the example, and finally running the training script to start training.


1. Installing the environment:

Refer to the official documentation; it is not covered in detail here.
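For convenience, a typical install sequence from the official get-started guide (a sketch; the exact version pins depend on your CUDA/PyTorch combination, so check the docs):

pip install -U openmim
mim install mmengine
mim install "mmcv>=2.0.0,<2.1.0"
cd mmdetection-3.1.0
pip install -v -e .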

2. Building the COCO dataset:

In this example, the annotations are XML files produced with the labelImg tool (mine are in VOC format).
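For reference, the conversion scripts below only read the <size>, <name>, and <bndbox> fields, so a minimal labelImg/VOC annotation looks like this (file name and coordinates are made up):

<annotation>
    <filename>IMG_0001.jpg</filename>
    <size>
        <width>1920</width>
        <height>1080</height>
        <depth>3</depth>
    </size>
    <object>
        <name>person</name>
        <bndbox>
            <xmin>100</xmin>
            <ymin>200</ymin>
            <xmax>300</xmax>
            <ymax>400</ymax>
        </bndbox>
    </object>
</annotation>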

① Prepare the custom dataset:

Put all annotation files in the Annotations folder and all images in the JPEGImages folder.

② Convert the dataset to COCO format:

First, split the dataset into training and validation sets by running the Python script splitVOCDataset.py:

import os
import random
import math
from tqdm import tqdm

# Split a VOC-format dataset and report image files with no matching .xml
image_extensions = ['.jpg', '.png', '.gif', '.bmp', '.tiff', '.jpeg', '.webp', '.svg', '.psd', '.cr2', '.nef', '.dng']


def split_voc_dataset(dataset_dir, train_ratio, val_ratio, use_random_seed=False, random_seed=999):
    if not (0 < train_ratio + val_ratio <= 1):
        print("Invalid ratio values. train_ratio + val_ratio must be in (0, 1].")
        return

    annotations_dir = os.path.join(dataset_dir, 'Annotations')
    images_dir = os.path.join(dataset_dir, 'JPEGImages')
    output_dir = os.path.join(dataset_dir, 'ImageSets/Main')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dict_info = dict()
    # List all the image files in the JPEGImages directory
    for file in os.listdir(images_dir):
        stem, ext = os.path.splitext(file)
        # match on the actual extension, not a substring anywhere in the name
        if ext.lower() in image_extensions:
            dict_info[stem] = ext

    # List all the XML files in the Annotations directory
    xml_files = [file for file in os.listdir(annotations_dir) if file.endswith('.xml')]

    if use_random_seed:
        # Set the random seed for reproducibility
        random.seed(random_seed)

    random.shuffle(xml_files)

    num_samples = len(xml_files)
    print('1. >>>>>> num_samples (total xml files): ', num_samples)
    num_train = math.ceil(num_samples * train_ratio)
    print('2. >>>>>> num_train: ', num_train)
    num_val = num_samples - num_train
    print('3. >>>>>> num_val: ', num_val)
    train_xml_files = xml_files[:num_train]
    val_xml_files = xml_files[num_train:num_train + num_val]

    with open(os.path.join(output_dir, 'train_list.txt'), 'w') as train_file:
        for xml_file in train_xml_files:
            image_name = os.path.splitext(xml_file)[0]
            if image_name in dict_info:
                image_path = os.path.join('JPEGImages', image_name + dict_info[image_name])
                annotation_path = os.path.join('Annotations', xml_file)
                train_file.write(f'{image_path}\t{annotation_path}\n')
            else:
                print(f"No image found for {os.path.join(images_dir, image_name)}")

    with open(os.path.join(output_dir, 'val_list.txt'), 'w') as val_file:
        for xml_file in val_xml_files:
            image_name = os.path.splitext(xml_file)[0]
            if image_name in dict_info:
                image_path = os.path.join('JPEGImages', image_name + dict_info[image_name])
                annotation_path = os.path.join('Annotations', xml_file)
                val_file.write(f'{image_path}\t{annotation_path}\n')
            else:
                print(f"No image found for {os.path.join(images_dir, image_name)}")

    labels = set()
    for xml_file in tqdm(xml_files):
        annotation_path = os.path.join(annotations_dir, xml_file)
        with open(annotation_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                if '<name>' in line:
                    label = line.strip().replace('<name>', '').replace('</name>', '')
                    labels.add(label)

    with open(os.path.join(output_dir, 'labels.txt'), 'w', encoding='utf-8') as labels_file:
        for label in labels:
            labels_file.write(f'{label}\n')

if __name__ == "__main__":
    train_ratio = 0.85  # Adjust the train-validation split ratio as needed
    val_ratio = 1 - train_ratio

    # If use_random_seed is True, changing random_seed changes which images fall
    # into the train/val split, while reusing the same seed keeps it reproducible.
    # If it is False, the shuffle is unseeded and the split will differ on every run.
    random_seed = 3688
    use_random_seed = True
    dataset_dir = '/home/topf/dataSet/customer_data/'
    split_voc_dataset(dataset_dir, train_ratio, val_ratio, use_random_seed, random_seed)



After this step, an ImageSets/Main folder is created containing train_list.txt, val_list.txt, and labels.txt.
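Each line of train_list.txt and val_list.txt pairs an image path with its annotation path, separated by a tab (the file name here is made up):

JPEGImages/IMG_0001.jpg	Annotations/IMG_0001.xml

labels.txt holds one class name per line, collected from the <name> tags in the annotations.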

Next, convert to COCO format by running the script voc2cocoFormat.py; the full code is:

import os
import json

from xml.etree import ElementTree as ET


def parse_xml(dataset_dir, xml_file):
    xml_path = os.path.join(dataset_dir, xml_file)
    tree = ET.parse(xml_path)
    root = tree.getroot()

    objects = root.findall('object')
    annotations = []

    for obj in objects:
        bbox = obj.find('bndbox')
        xmin = int(bbox.find('xmin').text)
        ymin = int(bbox.find('ymin').text)
        xmax = int(bbox.find('xmax').text)
        ymax = int(bbox.find('ymax').text)

        # Extract label from XML annotation
        label = obj.find('name').text
        if not label:
            print(f"Label not found in XML annotation. Skipping annotation.")
            continue

        annotations.append({
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
            'label': label
        })

    return annotations


def convert_to_coco_format(image_list_file, annotations_dir, output_json_file, dataset_dir):
    images = []
    annotations = []
    categories = []

    # Load labels
    with open(os.path.join(annotations_dir, 'labels.txt'), 'r', encoding='utf-8') as labels_file:
        label_lines = labels_file.readlines()
        categories = [{'id': i + 1, 'name': label.strip()} for i, label in enumerate(label_lines)]

    annotation_id = 1  # Initialize unique annotation ID

    # Load image list file
    with open(image_list_file, 'r') as image_list:
        image_lines = image_list.readlines()
        for i, line in enumerate(image_lines):
            image_path, annotation_path = line.strip().split('\t')
            image_id = i + 1
            image_filename = os.path.basename(image_path)

            # Extract image size from XML file
            xml_path = os.path.join(dataset_dir, annotation_path)
            tree = ET.parse(xml_path)
            size = tree.find('size')
            image_height = int(size.find('height').text)
            image_width = int(size.find('width').text)

            images.append({
                'id': image_id,
                'file_name': image_filename,
                'height': image_height,
                'width': image_width,
                'license': None,
                'flickr_url': None,
                'coco_url': None,
                'date_captured': None
            })

            # Load annotations from XML files
            xml_annotations = parse_xml(dataset_dir, annotation_path)
            for xml_annotation in xml_annotations:
                label = xml_annotation['label']
                category_id = next((cat['id'] for cat in categories if cat['name'] == label), None)
                if category_id is None:
                    print(f"Label '{label}' not found in categories. Skipping annotation.")
                    continue

                bbox = {
                    'xmin': xml_annotation['xmin'],
                    'ymin': xml_annotation['ymin'],
                    'xmax': xml_annotation['xmax'],
                    'ymax': xml_annotation['ymax']
                }

                annotations.append({
                    'id': annotation_id,
                    'image_id': image_id,
                    'category_id': category_id,
                    'bbox': [bbox['xmin'], bbox['ymin'], bbox['xmax'] - bbox['xmin'], bbox['ymax'] - bbox['ymin']],
                    'area': (bbox['xmax'] - bbox['xmin']) * (bbox['ymax'] - bbox['ymin']),
                    'segmentation': [],
                    'iscrowd': 0
                })
                annotation_id += 1  # Increment annotation ID for uniqueness

    coco_data = {
        'images': images,
        'annotations': annotations,
        'categories': categories
    }

    with open(output_json_file, 'w') as json_file:
        json.dump(coco_data, json_file, indent=4)


if __name__ == "__main__":
    # Adjust paths as needed
    output_dataset_dir = '/home/topf/dataSet/customer_data/'
    image_sets_dir = '/home/topf/dataSet/customer_data/ImageSets/Main/'
    train_list_file = os.path.join(image_sets_dir, 'train_list.txt')
    val_list_file = os.path.join(image_sets_dir, 'val_list.txt')
    output_train_json_file = os.path.join(output_dataset_dir, 'train_coco.json')
    output_val_json_file = os.path.join(output_dataset_dir, 'val_coco.json')

    convert_to_coco_format(train_list_file, image_sets_dir, output_train_json_file, output_dataset_dir)
    convert_to_coco_format(val_list_file, image_sets_dir, output_val_json_file, output_dataset_dir)
    print("The JSON file has been successfully generated!!!")

After this step, train_coco.json and val_coco.json are generated in the dataset directory.
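It is worth sanity-checking the generated JSON before training. A minimal sketch using pycocotools (already pulled in as an mmdetection dependency); the path assumes the dataset_dir used above:

from pycocotools.coco import COCO

# loading will fail loudly if the JSON structure is malformed
coco = COCO('/home/topf/dataSet/customer_data/train_coco.json')
print('images:     ', len(coco.imgs))
print('annotations:', len(coco.anns))
print('categories: ', coco.loadCats(coco.getCatIds()))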

3. Configuring the training config file and parameters:

We take the rtmdet-l model as an example.

① Copy configs/_base_/datasets/coco_detection.py and rename it coco_detection_person.py. The full contents follow (see the comments for the modified items):

# dataset settings
dataset_type = 'CocoDataset'
### dataset root path
data_root = 'dataset/det/person_det/coco/'
### define the dataset classes (note: these are referenced in the dataloader dicts below)
metainfo = {
    'classes': ('person', ),
    'palette': [
        (220, 20, 60),
    ]
}

# Example to use different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer from prefix (not support LMDB and Memcache yet)

# data_root = 's3://openmmlab/datasets/detection/coco/'

# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection/',
#         'data/': 's3://openmmlab/datasets/detection/'
#     }))
backend_args = None

train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=backend_args),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackDetInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=backend_args),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    # If you don't have a gt annotation, delete the pipeline
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]


### training dataloader
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    batch_sampler=dict(type='AspectRatioBatchSampler'),
    dataset=dict(
        type=dataset_type,
        ### reference the dataset classes defined above
        metainfo=metainfo,
        ### dataset root path
        data_root=data_root,
        ### annotation json path
        ann_file='annotations/train_coco.json',
        ### image path
        data_prefix=dict(img='JPEGImages/'),
        filter_cfg=dict(filter_empty_gt=True, min_size=32),
        pipeline=train_pipeline,
        backend_args=backend_args))

### validation dataloader
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ### reference the dataset classes defined above
        metainfo=metainfo,
        ### dataset root path
        data_root=data_root,
        ### annotation json path
        ann_file='annotations/val_coco.json',
        ### image path
        data_prefix=dict(img='JPEGImages/'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=backend_args))
test_dataloader = val_dataloader

val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'annotations/val_coco.json',
    metric='bbox',
    format_only=False,
    backend_args=backend_args)
test_evaluator = val_evaluator

# inference on test dataset and
# format the output results for submission.
# test_dataloader = dict(
#     batch_size=1,
#     num_workers=2,
#     persistent_workers=True,
#     drop_last=False,
#     sampler=dict(type='DefaultSampler', shuffle=False),
#     dataset=dict(
#         type=dataset_type,
#         data_root=data_root,
#         ann_file=data_root + 'annotations/image_info_test-dev2017.json',
#         data_prefix=dict(img='test2017/'),
#         test_mode=True,
#         pipeline=test_pipeline))
# test_evaluator = dict(
#     type='CocoMetric',
#     metric='bbox',
#     format_only=True,
#     ann_file=data_root + 'annotations/image_info_test-dev2017.json',
#     outfile_prefix='./work_dirs/coco_detection/test')
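A note on metainfo: it drives both the class-id mapping and the visualization colors. If your dataset has more than one class, extend both entries together (the second class here is hypothetical), and make sure num_classes in the model head of step ② matches len(classes):

metainfo = {
    'classes': ('person', 'helmet'),   # 'helmet' is a made-up second class
    'palette': [
        (220, 20, 60),
        (0, 128, 255),                 # arbitrary RGB color per class
    ]
}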

② Copy configs/rtmdet/rtmdet_l_8xb32-300e_coco.py and rename it rtmdet_l_8xb32-300e_coco_person_det.py. The full contents follow (see the comments for the modified items):

### ../_base_/datasets/coco_detection changed to ../_base_/datasets/coco_detection_person
_base_ = [
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py',
    '../_base_/datasets/coco_detection_person.py', './rtmdet_tta.py'
]
### load the pretrained model
load_from = '/home/topf/ai-project/mmdetection-3.1.0/checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth'

model = dict(
    type='RTMDet',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[103.53, 116.28, 123.675],
        std=[57.375, 57.12, 58.395],
        bgr_to_rgb=False,
        batch_augments=None),
    backbone=dict(
        type='CSPNeXt',
        arch='P5',
        expand_ratio=0.5,
        deepen_factor=1,
        widen_factor=1,
        channel_attention=True,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    neck=dict(
        type='CSPNeXtPAFPN',
        in_channels=[256, 512, 1024],
        out_channels=256,
        num_csp_blocks=3,
        expand_ratio=0.5,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    bbox_head=dict(
        type='RTMDetSepBNHead',
        ### change to the number of classes in your custom dataset
        num_classes=1,
        in_channels=256,
        stacked_convs=2,
        feat_channels=256,
        anchor_generator=dict(
            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
        bbox_coder=dict(type='DistancePointBBoxCoder'),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
        with_objectness=False,
        exp_on_reg=True,
        share_conv=True,
        pred_kernel_size=1,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    train_cfg=dict(
        assigner=dict(type='DynamicSoftLabelAssigner', topk=13),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    test_cfg=dict(
        nms_pre=30000,
        min_bbox_size=0,
        score_thr=0.001,
        nms=dict(type='nms', iou_threshold=0.65),
        max_per_img=300),
)



train_pipeline = [
    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
    dict(
        type='RandomResize',
        scale=(1280, 1280),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(type='RandomCrop', crop_size=(640, 640)),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='CachedMixUp',
        img_scale=(640, 640),
        ratio_range=(1.0, 1.0),
        max_cached_images=20,
        pad_val=(114, 114, 114)),
    dict(type='PackDetInputs')
]

train_pipeline_stage2 = [
    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='RandomResize',
        scale=(640, 640),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(type='RandomCrop', crop_size=(640, 640)),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs')
]

test_pipeline = [
    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]

train_dataloader = dict(
    ### adjust to your GPU memory and compute (my test card is an RTX 2080 Ti)
    batch_size=2,
    num_workers=2,
    batch_sampler=None,
    pin_memory=True,
    dataset=dict(pipeline=train_pipeline))

val_dataloader = dict(
    ### adjust to your GPU memory and compute (my test card is an RTX 2080 Ti)
    batch_size=2, num_workers=2, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader

### max_epochs = 300
### total number of training epochs
max_epochs = 600
stage2_num_epochs = 20
### base_lr = 0.004
### base learning rate
base_lr = 0.002
### checkpoint/validation interval (save every 30 epochs)
interval = 30

train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=interval,
    dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])


val_evaluator = dict(proposal_nums=(100, 1, 10))
test_evaluator = val_evaluator

# optimizer
optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))

# learning rate
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1.0e-5,
        by_epoch=False,
        begin=0,
        end=1000),
    dict(
        # use cosine lr from max_epochs // 2 to max_epochs (here, epoch 300 to 600)
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True),
]

# hooks
default_hooks = dict(
    checkpoint=dict(
        interval=interval,
        max_keep_ckpts=3  # only keep latest 3 checkpoints
    ))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        priority=49),
    dict(
        type='PipelineSwitchHook',
        switch_epoch=max_epochs - stage2_num_epochs,
        switch_pipeline=train_pipeline_stage2)
]

The configuration work is now complete.
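Before launching, you can optionally load the merged config with mmengine (the config system mmdetection 3.x is built on) and confirm the overrides took effect; a quick sketch:

from mmengine.config import Config

cfg = Config.fromfile('configs/rtmdet/rtmdet_l_8xb32-300e_coco_person_det.py')
print(cfg.model.bbox_head.num_classes)        # expect 1
print(cfg.train_dataloader.dataset.ann_file)  # expect 'annotations/train_coco.json'
print(cfg.train_cfg.max_epochs)               # expect 600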

4. Start training:

Run the training script:

python tools/train.py configs/rtmdet/rtmdet_l_8xb32-300e_coco_person_det.py --work-dir  work_dir_person_det/
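Checkpoints are written to work_dir_person_det/ every 30 epochs (tools/train.py also accepts --resume to continue from the latest one). Once training finishes, a quick smoke test with mmdet 3.x's DetInferencer might look like this (a sketch; the checkpoint name and test image path are placeholders):

from mmdet.apis import DetInferencer

# point the inferencer at our config and a trained checkpoint
inferencer = DetInferencer(
    model='configs/rtmdet/rtmdet_l_8xb32-300e_coco_person_det.py',
    weights='work_dir_person_det/epoch_600.pth')
# run on one image and save the visualized result to outputs/
inferencer('path/to/test_image.jpg', out_dir='outputs/')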

And with that, the training journey begins. Enjoy!
