[Object Detection] Model Inference with Swin Transformer

1. Source code

https://github.com/SwinTransformer/Swin-Transformer-Object-Detection

2. Related files

(Figure: the working directory for inference — the script below, the config file mask_rcnn_swin_tiny_patch4_window7_mstrain_480-800_adamw_1x_coco.py, the trained checkpoint epoch_100.pth, and a test image 2.jpg.)

3. Code

3.1 infer.py

from mmdet.apis import init_detector, inference_detector, show_result_pyplot
import numpy as np
import cv2
import random


def InferResult(img, classes, model):
	# Run the detector on one image; return the annotated image and the
	# de-duplicated list of detected class names.
	result = inference_detector(model, img)

	if isinstance(result, tuple):
		bbox_result, segm_result = result
		if isinstance(segm_result, tuple):
			segm_result = segm_result[0]  # ms rcnn
	else:
		bbox_result, segm_result = result, None

	bboxes = np.vstack(bbox_result)
	labels = [
		np.full(bbox.shape[0], i, dtype=np.int32)
		for i, bbox in enumerate(bbox_result)
	]
	labels = np.concatenate(labels)

	assert bboxes.shape[1] == 5
	scores = bboxes[:, -1]
	inds = scores > 0.3  # keep detections above the confidence threshold
	bboxes = bboxes[inds, :]
	labels = labels[inds]
	img, result_label = draw(img, bboxes, classes, labels)
	return img, result_label

def plot_one_box(x, img, label=None, color=None, line_thickness=None):
	# Plots one bounding box on image img
	tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
	color = color or [random.randint(0, 255) for _ in range(3)]
	c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
	cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
	if label:
		tf = max(tl - 1, 1)  # font thickness
		t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
		c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
		cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
		cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)

# draw the thresholded detections on the image and show the result
def draw(img, bboxes, classes, labels):
	img = cv2.imread(img)  # img is a file path at this point
	colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(classes))]  # one color per class
	result_label = []
	for i in range(bboxes.shape[0]):
		xyxy = bboxes[i, :4]
		conf = float(bboxes[i, -1])
		label = '%s %.2f' % (classes[labels[i]], conf)
		result_label.append(classes[labels[i]])
		plot_one_box(xyxy, img, label=label, color=colors[labels[i]], line_thickness=1)
	result_label = list(set(result_label))  # de-duplicate class names
	cv2.imshow('test', img)
	cv2.waitKey()
	return img, result_label

if __name__ == '__main__':
	img = '2.jpg'  # path to the test image
	config_file = 'mask_rcnn_swin_tiny_patch4_window7_mstrain_480-800_adamw_1x_coco.py'
	checkpoint_file = 'epoch_100.pth'
	classes = ['The color of silica gel is abnormal', 'The color of silica gel is normal', 'The door is open', 'The door is close', 'breakage', 'dirt', 'rust', 'foreign object', 'oil leakage', 'animal', 'hat', 'person']
	model = init_detector(config_file, checkpoint_file, device='cuda:0')
	img, result_label = InferResult(img, classes, model)
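
For reference, here is a minimal sketch of the data layout the script above relies on: with a Mask R-CNN model, `inference_detector` returns a `(bbox_result, segm_result)` tuple in MMDetection 2.x, where `bbox_result` holds one `(N, 5)` array of `[x1, y1, x2, y2, score]` rows per class. The sketch reuses `model` and `classes` from the script and mirrors its 0.3 threshold.

```python
# Sketch of the result structure parsed by InferResult (MMDetection 2.x).
result = inference_detector(model, '2.jpg')
bbox_result, segm_result = result          # Mask R-CNN returns a tuple
assert len(bbox_result) == len(classes)    # one (N, 5) array per class
for class_id, dets in enumerate(bbox_result):
    for x1, y1, x2, y2, score in dets:
        if score > 0.3:                    # same threshold as InferResult
            print(classes[class_id], round(float(score), 2))
```

Alternatively, the imported `show_result_pyplot(model, img, result, score_thr=0.3)` renders the same detections without the custom `draw` helper.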

3.2 mask_rcnn_swin_tiny_patch4_window7_mstrain_480-800_adamw_1x_coco.py (full dumped config)

model = dict(
    type='MaskRCNN',
    pretrained=None,
    backbone=dict(
        type='SwinTransformer',
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        ape=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        use_checkpoint=False),
    neck=dict(
        type='FPN',
        in_channels=[96, 192, 384, 768],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=12,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=12,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=3,
    workers_per_gpu=0,
    train=dict(
        type='CocoDataset',
        ann_file='data/coco/annotations/instances_train2017.json',
        img_prefix='data/coco/train2017/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(
                type='AutoAugment',
                policies=[
                    [
                        dict(
                            type='Resize',
                            img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                       (576, 1333), (608, 1333), (640, 1333),
                                       (672, 1333), (704, 1333), (736, 1333),
                                       (768, 1333), (800, 1333)],
                            multiscale_mode='value',
                            keep_ratio=True)
                    ],
                    [
                        dict(
                            type='Resize',
                            img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                            multiscale_mode='value',
                            keep_ratio=True),
                        dict(
                            type='RandomCrop',
                            crop_type='absolute_range',
                            crop_size=(384, 600),
                            allow_negative_crop=True),
                        dict(
                            type='Resize',
                            img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                       (576, 1333), (608, 1333), (640, 1333),
                                       (672, 1333), (704, 1333), (736, 1333),
                                       (768, 1333), (800, 1333)],
                            multiscale_mode='value',
                            override=True,
                            keep_ratio=True)
                    ]
                ]),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(
                type='Collect',
                keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
        ]),
    val=dict(
        type='CocoDataset',
        ann_file='data/coco/annotations/instances_val2017.json',
        img_prefix='data/coco/val2017/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1333, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='CocoDataset',
        ann_file='data/coco/annotations/instances_test2017.json',
        img_prefix='data/coco/val2017/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1333, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='DefaultFormatBundle'),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
evaluation = dict(interval=1, metric='bbox')
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(
            absolute_pos_embed=dict(decay_mult=0.0),
            relative_position_bias_table=dict(decay_mult=0.0),
            norm=dict(decay_mult=0.0))))
optimizer_config = dict(
    grad_clip=None,
    type='DistOptimizerHook',
    update_interval=1,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
runner = dict(type='EpochBasedRunnerAmp', max_epochs=100)
checkpoint_config = dict(interval=2)
log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])
custom_hooks = [dict(type='NumClassCheckHook')]
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = 'mask_rcnn_swin_12.pth'
resume_from = None
workflow = [('train', 1)]
fp16 = None
work_dir = './work_dirs/mask_rcnn_swin_tiny_patch4_window7_mstrain_480-800_adamw_1x_coco'
gpu_ids = range(0, 1)
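
Rather than editing the file by hand, the dumped config above can also be loaded and tweaked programmatically; a minimal sketch using `mmcv.Config` (field names as dumped above):

```python
from mmcv import Config

# Load the dumped config and override fields before building the detector.
cfg = Config.fromfile('mask_rcnn_swin_tiny_patch4_window7_mstrain_480-800_adamw_1x_coco.py')
cfg.model.test_cfg.rcnn.score_thr = 0.3   # filter low scores inside the model
cfg.data.samples_per_gpu = 1              # single-image inference

print(cfg.model.backbone.type)            # -> SwinTransformer
```

In MMDetection 2.x, `init_detector` accepts either a config file path or a loaded `Config` object, so the modified `cfg` can be passed to it directly.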

4. Putting the inference workflow together

To run object-detection inference with a Swin Transformer backbone, the pipeline can be assembled as follows. The steps below summarize the official documentation and common community practice.

4.1 Install dependencies

Before starting, make sure the required Python libraries are installed, along with matching versions of PyTorch and torchvision. If installation fails, download prebuilt wheels that match your CUDA version and verify that they work.

```bash
pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio===0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
```

4.2 Download the model weights

For the Swin Transformer + Mask R-CNN detector, first obtain trained weights, for example:

```bash
wget https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth
```

This link provides Swin-Tiny weights pretrained on the COCO dataset.

4.3 Set up argument parsing

Use `argparse` to expose the basic run-time options, such as the input image path, the output directory, and any other test-time settings:

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser('Swin Transformer Test Script', add_help=False)
    parser.add_argument('--config-file', type=str, default="configs/swin_t_mask_rcnn.py",
                        help='path to config file')
    parser.add_argument('--input-image', type=str, required=True,
                        help='path to input image')
    parser.add_argument('--output-dir', type=str, default="./outputs/",
                        help='directory to save results')
    return parser.parse_args()
```

This snippet defines a simple command-line interface for the key external parameters.

4.4 Load the model and run a forward pass

Load the pretrained model and run prediction on a single image. This assumes an MMDetection-format config file (`swin_t_mask_rcnn.py`) describing the network structure and its initialization:

```python
import os
from mmdet.apis import init_detector, inference_detector

# Build the detector from config + checkpoint
args = parse_args()
model = init_detector(args.config_file,
                      checkpoint='mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth')

# Run inference on the given image
image_path = args.input_image
result = inference_detector(model, image_path)

# Save the visualized result to disk
out_file = os.path.join(args.output_dir, 'detected_' + os.path.basename(image_path))
model.show_result(image_path, result, out_file=out_file, score_thr=0.3)
```

This covers everything from reading the user-supplied image through detection to writing the annotated result to disk.

4.5 Tweak options (optional)

For special needs, such as changing the window size or the number of feature-pyramid levels, edit the corresponding fields in the Python (or YAML) config file and rerun the pipeline.

Notes:

- When deploying on a GPU, confirm that VRAM is sufficient and that the driver is compatible with the framework version.
- When processing large video streams, consider batching images to improve throughput and reduce latency; see the sketch below.
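
On the batching note above, `inference_detector` in recent MMDetection 2.x releases also accepts a list of images and returns one result per image, which amortizes per-call overhead; a hedged sketch (frame paths are placeholders, and older releases may only accept a single image):

```python
# Batch inference sketch; assumes an MMDetection 2.x version whose
# inference_detector accepts a list of images.
image_paths = ['frame_0001.jpg', 'frame_0002.jpg', 'frame_0003.jpg']
results = inference_detector(model, image_paths)      # one result per image
for path, result in zip(image_paths, results):
    bbox_result = result[0] if isinstance(result, tuple) else result
    n_dets = sum(len(per_class) for per_class in bbox_result)
    print(path, n_dets, 'raw detections')
```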