本文介绍
为提升 YOLOv8 在目标检测任务中的特征表达能力,本文通过 CVPR2025 OverLoCK 所提出的 Deep-stage Decomposition Strategy 中 Base-Net 的核心模块 RepConvBlock 改进 YOLOv8 的 C2f 模块。RepConvBlock 通过膨胀卷积、通道注意力等模块,在获得较大感受野的同时增强了卷积神经网络的特征表达能力。实验结果如下(本文通过 VOC 数据集验证算法性能,epoch 为 100,batchsize 为 32,imagesize 为 640*640):
Model | mAP50-95 | mAP50 | run time (h) | params (M) | inference time (ms) |
---|---|---|---|---|---|
YOLOv8 | 0.549 | 0.760 | 1.051 | 3.01 | 0.2+0.3(postprocess) |
YOLO11 | 0.553 | 0.757 | 1.142 | 2.59 | 0.2+0.3(postprocess) |
YOLOv8_C2f-RepConvBlock | 0.557 | 0.765 | 1.481 | 2.54 | 0.4+0.3(postprocess) |
重要声明:本文的改进效果仅在我所使用的数据集上得到验证,在其他数据集上的有效性可能有所不同。
本文改进是为了降低最新研究进展至YOLO的代码迁移难度,从而为对最新研究感兴趣的同学提供参考。
代码迁移
重点内容
步骤一:迁移代码
ultralytics框架的模块代码主要放在`ultralytics/nn`文件夹下,此处为了与官方代码进行区分,可以新增一个`extra_modules`文件夹,然后将我们的代码添加进去(例如保存为`overlock.py`)。
具体代码如下:
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from timm.models.layers import DropPath, to_2tuple
from torch.utils.checkpoint import checkpoint
__all__ = ['RepConvBlock']
def get_conv2d(in_channels,
               out_channels,
               kernel_size,
               stride,
               padding,
               dilation,
               groups,
               bias,
               attempt_use_lk_impl=True):
    """Build a 2-D convolution, using the optional iGEMM depthwise kernel when possible.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Int or pair; normalised to a 2-tuple.
        stride: Convolution stride.
        padding: Int, pair, or ``None``. ``None`` means "same"-style padding of
            ``kernel_size // 2`` per axis.
        dilation: Convolution dilation.
        groups: Number of groups.
        bias: Whether the layer has a bias term.
        attempt_use_lk_impl: If true, try to import ``DepthWiseConv2dImplicitGEMM``
            for large square kernels.

    Returns:
        A ``DepthWiseConv2dImplicitGEMM`` instance when the optional package is
        importable and the layer is a stride-1, dilation-1 depthwise conv with a
        square kernel larger than 5 and half-kernel padding; otherwise a plain
        ``nn.Conv2d``.
    """
    kernel_size = to_2tuple(kernel_size)
    if padding is None:
        padding = (kernel_size[0] // 2, kernel_size[1] // 2)
    else:
        padding = to_2tuple(padding)
    need_large_impl = (kernel_size[0] == kernel_size[1]
                       and kernel_size[0] > 5
                       and padding == (kernel_size[0] // 2, kernel_size[1] // 2))

    if attempt_use_lk_impl and need_large_impl:
        print('---------------- trying to import iGEMM implementation for large-kernel conv')
        try:
            from depthwise_conv2d_implicit_gemm import DepthWiseConv2dImplicitGEMM
            print('---------------- found iGEMM implementation ')
        except Exception:
            # Best-effort optional dependency: any import-time failure falls back
            # to nn.Conv2d, but KeyboardInterrupt/SystemExit are no longer swallowed.
            DepthWiseConv2dImplicitGEMM = None
            print('---------------- found no iGEMM. use original conv. follow https://github.com/AILab-CVC/UniRepLKNet to install it.')
        # need_large_impl is already guaranteed by the enclosing branch.
        if DepthWiseConv2dImplicitGEMM is not None and in_channels == out_channels \
                and out_channels == groups and stride == 1 and dilation == 1:
            print(f'===== iGEMM Efficient Conv Impl, channels {in_channels}, kernel size {kernel_size} =====')
            return DepthWiseConv2dImplicitGEMM(in_channels, kernel_size, bias=bias)

    return nn.Conv2d(in_channels, out_channels,
                     kernel_size=kernel_size,
                     stride=stride,
                     padding=padding,
                     dilation=dilation,
                     groups=groups,
                     bias=bias)
def get_bn(dim, use_sync_bn=False):
    """Return a batch-norm layer over ``dim`` channels.

    ``nn.SyncBatchNorm`` is used when ``use_sync_bn`` is truthy, otherwise
    ``nn.BatchNorm2d``.
    """
    norm_cls = nn.SyncBatchNorm if use_sync_bn else nn.BatchNorm2d
    return norm_cls(dim)
def fuse_bn(conv, bn):
    """Fold a batch-norm layer into the preceding convolution.

    Uses the BN running statistics and affine parameters to compute the
    equivalent convolution kernel and bias.

    Returns:
        ``(kernel, bias)``: the conv weight scaled per output channel, and the
        bias that absorbs the BN shift. A conv without bias is treated as
        having bias 0.
    """
    if conv.bias is None:
        conv_bias = 0
    else:
        conv_bias = conv.bias
    std = (bn.running_var + bn.eps).sqrt()
    fused_kernel = conv.weight * (bn.weight / std).reshape(-1, 1, 1, 1)
    centered = conv_bias - bn.running_mean
    fused_bias = bn.bias + centered * bn.weight / std
    return fused_kernel, fused_bias
def convert_dilated_to_nondilated(kernel, dilate_rate):
    """Expand a dilated conv kernel into its equivalent dense (non-dilated) kernel.

    A transposed convolution with a 1x1 all-ones kernel and ``stride=dilate_rate``
    spreads the taps of ``kernel`` apart, inserting zeros between them, so a
    ``k x k`` kernel becomes ``(dilate_rate * (k - 1) + 1)`` wide.

    Args:
        kernel: 4-D conv weight. When ``kernel.size(1) == 1`` it is handled as a
            depthwise kernel in one call; otherwise each input-channel slice is
            expanded separately and the results are concatenated on dim 1.
        dilate_rate: Dilation of the original convolution.

    Returns:
        The expanded kernel, with the same dtype and device as ``kernel``.
    """
    # new_ones inherits dtype and device from `kernel`, so half/double kernels
    # no longer hit a dtype mismatch inside conv_transpose2d.
    identity_kernel = kernel.new_ones((1, 1, 1, 1))
    if kernel.size(1) == 1:
        # This is a DW kernel
        return F.conv_transpose2d(kernel, identity_kernel, stride=dilate_rate)
    # This is a dense or group-wise (but not DW) kernel
    slices = [
        F.conv_transpose2d(kernel[:, i:i + 1, :, :], identity_kernel, stride=dilate_rate)
        for i in range(kernel.size(1))
    ]
    return torch.cat(slices, dim=1)
def merge_dilated_into_large_kernel(large_kernel, dilated_kernel, dilated_r):
    """Add a dilated branch kernel into a large dense kernel.

    The dilated kernel is converted to its dense equivalent, zero-padded equally
    on all four sides to the large kernel's spatial size, and summed with it.
    Only ``size(2)`` of each kernel is inspected.
    NOTE(review): this presumably expects square, odd-sized kernels with the
    large kernel at least as big as the expanded branch -- confirm with callers.

    Returns:
        The merged kernel tensor (a new tensor; inputs are not modified).
    """
    dense_branch = convert_dilated_to_nondilated(dilated_kernel, dilated_r)
    # Spatial extent covered by the dilated branch once its gaps are filled in.
    branch_extent = dilated_r * (dilated_kernel.size(2) - 1) + 1
    border = large_kernel.size(2) // 2 - branch_extent // 2
    padded_branch = F.pad(dense_branch, [border, border, border, border])
    return large_kernel + padded_branch
class SEModule(nn.Module):
    """Channel gate: global average pool -> 1x1 conv -> act -> 1x1 conv -> act.

    The resulting per-channel weights multiply the input. The hidden width is
    ``max(16, dim // red)``.

    Args:
        dim: Number of input/output channels.
        red: Channel reduction ratio for the hidden layer.
        inner_act: Activation class used between the two 1x1 convs.
        out_act: Activation class producing the final gate values.
    """

    def __init__(self, dim, red=8, inner_act=nn.GELU, out_act=nn.Sigmoid):
        super().__init__()
        hidden_dim = max(16, dim // red)
        gate_layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(dim, hidden_dim, kernel_size=1),
            inner_act(),
            nn.Conv2d(hidden_dim, dim, kernel_size=1),
            out_act(),
        ]
        self.proj = nn.Sequential(*gate_layers)

    def forward(self, x):
        gate = self.proj(x)
        return x * gate
class LayerScale(nn.Module):
    """Learnable per-channel scale and bias, applied as a depthwise 1x1 conv.

    Args:
        dim: Number of channels.
        init_value: Initial value of every scale weight; the bias starts at 0.
    """

    def __init__(self, dim, init_value=1e-5):
        super().__init__()
        initial_scale = torch.ones(dim, 1, 1, 1) * init_value
        self.weight = nn.Parameter(initial_scale, requires_grad=True)
        self.bias = nn.Parameter(torch.zeros(dim), requires_grad=True)

    def forward(self, x):
        # groups == channel count, so each channel gets its own 1x1 weight.
        num_channels = x.shape[1]
        return F.conv2d(x, weight=self.weight, bias=self.bias, groups=num_channels)
class LayerNorm2d(nn.LayerNorm):
    """LayerNorm over the channel axis of ``(b, c, h, w)`` tensors.

    The input is rearranged to channels-last, normalised by ``nn.LayerNorm``
    (``eps=1e-6``) over the last axis, and rearranged back to channels-first.
    """

    def __init__(self, dim):
        super().__init__(normalized_shape=dim, eps=1e-6)

    def forward(self, x):
        channels_last = rearrange(x, 'b c h w -> b h w c')
        normalized = super().forward(channels_last)
        channels_first = rearrange(normalized, 'b h w c -> b c h w')
        return channels_first.contiguous()
class GRN(nn.Module):
    """GRN (Global Response Normalization) layer.

    Originally proposed in ConvNeXt V2 (https://arxiv.org/abs/2301.00808).
    Per the original authors' note, this implementation is more efficient than
    the reference one (https://github.com/facebookresearch/ConvNeXt-V2).
    We assume the inputs to this layer are (N, C, H, W).

    Args:
        dim: Number of channels.
        use_bias: Whether to add a learnable per-channel ``beta`` to the output.
    """

    def __init__(self, dim, use_bias=True):
        super().__init__()
        self.use_bias = use_bias
        self.gamma = nn.Parameter(torch.zeros(1, dim, 1, 1))
        if use_bias:
            self.beta = nn.Parameter(torch.zeros(1, dim, 1, 1))

    def forward(self, x):
        # L2 norm of each channel over its two trailing axes.
        channel_norm = torch.norm(x, p=2, dim=(-1, -2), keepdim=True)
        # Norm of each channel relative to the mean norm across channels.
        relative_norm = channel_norm / (channel_norm.mean(dim=1, keepdim=True) + 1e-6)
        scaled = (self.gamma * relative_norm + 1) * x
        if not self.use_bias:
            return scaled
        return scaled + self.beta
class DilatedReparamBlock(nn.Module):
    """
    Dilated Reparam Block proposed in UniRepLKNet (https://github.com/AILab-CVC/UniRepLKNet)
    We assume the inputs to this block are (N, C, H, W)

    In training form the block sums a large depthwise conv + BN with several
    smaller dilated depthwise conv + BN branches. ``merge_dilated_branches``
    folds all branches into the single large conv. In deploy form only the
    large conv (with bias) exists.

    Args:
        channels: Number of input/output channels (all convs are depthwise).
        kernel_size: Size of the large kernel; one of 5, 7, 9, 11, 13, 15, 17, 19.
        deploy: If true, build only the large conv and skip BN and the branches.
        use_sync_bn: Passed to ``get_bn`` to choose the batch-norm flavour.
        attempt_use_lk_impl: Passed to ``get_conv2d`` for the large conv.

    Raises:
        ValueError: If ``kernel_size`` is not one of the supported sizes.
    """

    # Default settings. We did not tune them carefully. Different settings may work better.
    # Maps large kernel size -> (branch kernel sizes, branch dilation rates).
    _BRANCH_SETTINGS = {
        19: ((5, 7, 9, 9, 3, 3, 3), (1, 1, 1, 2, 4, 5, 7)),
        17: ((5, 7, 9, 3, 3, 3), (1, 1, 2, 4, 5, 7)),
        15: ((5, 7, 7, 3, 3, 3), (1, 1, 2, 3, 5, 7)),
        13: ((5, 7, 7, 3, 3, 3), (1, 1, 2, 3, 4, 5)),
        11: ((5, 7, 5, 3, 3, 3), (1, 1, 2, 3, 4, 5)),
        9: ((5, 7, 5, 3, 3), (1, 1, 2, 3, 4)),
        7: ((5, 3, 3, 3), (1, 1, 2, 3)),
        5: ((3, 3), (1, 2)),
    }

    def __init__(self, channels, kernel_size, deploy, use_sync_bn=False, attempt_use_lk_impl=True):
        super().__init__()
        self.lk_origin = get_conv2d(channels, channels, kernel_size, stride=1,
                                    padding=kernel_size // 2, dilation=1, groups=channels,
                                    bias=deploy, attempt_use_lk_impl=attempt_use_lk_impl)
        self.attempt_use_lk_impl = attempt_use_lk_impl

        settings = self._BRANCH_SETTINGS.get(kernel_size)
        if settings is None:
            raise ValueError('Dilated Reparam Block requires kernel_size >= 5')
        branch_kernels, branch_dilations = settings
        # Fresh lists per instance, as attributes callers may read.
        self.kernel_sizes = list(branch_kernels)
        self.dilates = list(branch_dilations)

        if deploy:
            return

        self.origin_bn = get_bn(channels, use_sync_bn)
        for k, r in zip(self.kernel_sizes, self.dilates):
            # Padding keeps the output the same spatial size as the input.
            branch_conv = nn.Conv2d(in_channels=channels, out_channels=channels,
                                    kernel_size=k, stride=1,
                                    padding=(r * (k - 1) + 1) // 2, dilation=r,
                                    groups=channels, bias=False)
            setattr(self, f'dil_conv_k{k}_{r}', branch_conv)
            setattr(self, f'dil_bn_k{k}_{r}', get_bn(channels, use_sync_bn=use_sync_bn))

    def forward(self, x):
        # No origin_bn means the block is in deploy (merged) form.
        if not hasattr(self, 'origin_bn'):
            return self.lk_origin(x)
        out = self.origin_bn(self.lk_origin(x))
        for k, r in zip(self.kernel_sizes, self.dilates):
            branch_conv = getattr(self, f'dil_conv_k{k}_{r}')
            branch_bn = getattr(self, f'dil_bn_k{k}_{r}')
            out = out + branch_bn(branch_conv(x))
        return out

    def merge_dilated_branches(self):
        """Fold BN and all dilated branches into ``lk_origin``; no-op if already merged."""
        if not hasattr(self, 'origin_bn'):
            return

        origin_k, origin_b = fuse_bn(self.lk_origin, self.origin_bn)
        for k, r in zip(self.kernel_sizes, self.dilates):
            branch_conv = getattr(self, f'dil_conv_k{k}_{r}')
            branch_bn = getattr(self, f'dil_bn_k{k}_{r}')
            branch_k, branch_b = fuse_bn(branch_conv, branch_bn)
            origin_k = merge_dilated_into_large_kernel(origin_k, branch_k, r)
            origin_b += branch_b

        num_channels = origin_k.size(0)
        merged_size = origin_k.size(2)
        merged_conv = get_conv2d(num_channels, num_channels, merged_size, stride=1,
                                 padding=merged_size // 2, dilation=1, groups=num_channels,
                                 bias=True, attempt_use_lk_impl=self.attempt_use_lk_impl)
        merged_conv.weight.data = origin_k
        merged_conv.bias.data = origin_b
        self.lk_origin = merged_conv

        # Remove the training-only layers so forward() takes the deploy path.
        delattr(self, 'origin_bn')
        for k, r in zip(self.kernel_sizes, self.dilates):
            delattr(self, f'dil_conv_k{k}_{r}')
            delattr(self, f'dil_bn_k{k}_{r}')
class CTXDownsample(nn.Module):
    """Stride-2 downsampling of a feature map and a context tensor.

    ``x`` is projected from ``dim`` to ``h_dim`` channels; ``ctx`` keeps its
    ``h_dim // 4`` channels. Both paths use a bias-free 3x3 stride-2 conv
    followed by BatchNorm2d.

    Args:
        dim: Channels of the feature input ``x``.
        h_dim: Channels of the projected feature output; the context path
            uses ``h_dim // 4`` channels.
    """

    def __init__(self, dim, h_dim):
        super().__init__()
        ctx_dim = h_dim // 4
        feature_conv = nn.Conv2d(dim, h_dim, kernel_size=3, stride=2, padding=1, bias=False)
        self.x_proj = nn.Sequential(feature_conv, nn.BatchNorm2d(h_dim))
        context_conv = nn.Conv2d(ctx_dim, ctx_dim, kernel_size=3, stride=2, padding=1, bias=False)
        self.h_proj = nn.Sequential(context_conv, nn.BatchNorm2d(ctx_dim))

    def forward(self, x, ctx):
        """Return ``(downsampled_x, downsampled_ctx)``."""
        return (self.x_proj(x), self.h_proj(ctx))
class ResDWConv(nn.Conv2d):
    """Depthwise convolution with a residual connection.

    The conv has ``dim`` groups and ``kernel_size // 2`` padding; its output
    is added to the input.
    """

    def __init__(self, dim, kernel_size=3):
        half_kernel = kernel_size // 2
        super().__init__(dim, dim, kernel_size=kernel_size, padding=half_kernel, groups=dim)

    def forward(self, x):
        conv_out = super().forward(x)
        return x + conv_out
class RepConvBlock(nn.Module):
    """Residual block: depthwise conv, then a large-kernel + MLP projection.

    Pipeline of ``proj``: norm -> DilatedReparamBlock -> BatchNorm2d -> SEModule
    -> 1x1 conv (expand to ``dim * mlp_ratio``) -> GELU -> ResDWConv -> GRN
    -> 1x1 conv (back to ``dim``) -> DropPath/Identity.

    Args:
        dim: Number of input/output channels.
        kernel_size: Large kernel size passed to ``DilatedReparamBlock``.
        mlp_ratio: Expansion ratio of the hidden width.
        ls_init_value: Initial LayerScale value; ``None`` disables LayerScale.
        res_scale: If true, LayerScale is applied to the residual branch input
            instead of the projection output.
        drop_path: Drop-path rate; values ``<= 0`` use ``nn.Identity``.
        norm_layer: Callable building the first normalisation layer from ``dim``.
        use_gemm: Forwarded as ``attempt_use_lk_impl`` to ``DilatedReparamBlock``.
        deploy: Forwarded to ``DilatedReparamBlock``.
        use_checkpoint: If true, run the block under activation checkpointing
            whenever the input requires grad.
    """

    def __init__(self,
                 dim=64,
                 kernel_size=7,
                 mlp_ratio=4,
                 ls_init_value=None,
                 res_scale=False,
                 drop_path=0,
                 norm_layer=LayerNorm2d,
                 use_gemm=False,
                 deploy=False,
                 use_checkpoint=False):
        super().__init__()
        self.res_scale = res_scale
        self.use_checkpoint = use_checkpoint

        mlp_dim = int(dim * mlp_ratio)

        self.dwconv = ResDWConv(dim, kernel_size=3)

        # Layers are created in pipeline order; the last entry must stay the
        # drop-path layer because forward_features indexes it as proj[-1].
        proj_layers = [
            norm_layer(dim),
            DilatedReparamBlock(dim, kernel_size=kernel_size, deploy=deploy,
                                use_sync_bn=False, attempt_use_lk_impl=use_gemm),
            nn.BatchNorm2d(dim),
            SEModule(dim),
            nn.Conv2d(dim, mlp_dim, kernel_size=1),
            nn.GELU(),
            ResDWConv(mlp_dim, kernel_size=3),
            GRN(mlp_dim),
            nn.Conv2d(mlp_dim, dim, kernel_size=1),
            DropPath(drop_path) if drop_path > 0 else nn.Identity(),
        ]
        self.proj = nn.Sequential(*proj_layers)

        if ls_init_value is None:
            self.ls = nn.Identity()
        else:
            self.ls = LayerScale(dim, init_value=ls_init_value)

    def forward_features(self, x):
        x = self.dwconv(x)
        if self.res_scale:
            # Scale the shortcut; the projection (incl. drop path) is added as-is.
            return self.ls(x) + self.proj(x)
        # Scale the projection output before drop path is applied.
        body = self.proj[:-1]
        drop_path = self.proj[-1]
        return x + drop_path(self.ls(body(x)))

    def forward(self, x):
        if self.use_checkpoint and x.requires_grad:
            return checkpoint(self.forward_features, x, use_reentrant=False)
        return self.forward_features(x)
步骤二:创建模块并导入
此时需要在当前目录新建一个`block.py`文件,用以统一管理自定义的C2f模块(当然也可以直接在`ultralytics/nn/modules/block.py`中添加)。内容如下:
import torch.nn as nn
from ..modules import C2f
from .overlock import RepConvBlock
class C2f_RepConvBlock(C2f):
    """C2f variant whose inner block list ``m`` is rebuilt from ``RepConvBlock``.

    All arguments are forwarded unchanged to ``C2f.__init__``; afterwards
    ``self.m`` is replaced by ``n`` ``RepConvBlock(self.c)`` instances.
    NOTE(review): ``self.c`` is presumably the hidden channel width set by
    ``C2f.__init__`` -- confirm against the installed ultralytics version.
    """

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        rep_blocks = [RepConvBlock(self.c) for _ in range(n)]
        self.m = nn.ModuleList(rep_blocks)
添加完成之后需要新增一个`__init__.py`文件,将添加的模块导入到`__init__.py`文件中,这样在调用的时候就可以直接使用`from extra_modules import *`。`__init__.py`文件需要撰写以下内容:
from .block import C2f_RepConvBlock
具体目录结构如下图所示:
nn/
└── extra_modules/
├── __init__.py
├── block.py
└── overlock.py
步骤三:修改`tasks.py`文件
首先在`tasks.py`文件中添加以下内容:
from ultralytics.nn.extra_modules import *
然后找到`parse_model()`函数,在函数中查找如下内容:
if m in base_modules:
c1, c2 = ch[f], args[0]
if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output)
c2 = make_divisible(min(c2, max_channels) * width, 8)
使用较老ultralytics版本的同学,此处可能不是`base_modules`,而是直接列出相关模块的集合,此时直接将自定义模块添加到该集合即可;否则就找到`base_modules`所指向的集合进行添加,添加方式如下:
# Modules matched by the `if m in base_modules:` branch of parse_model(),
# which reads their output channels from args[0].
base_modules = frozenset(
{
Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck,
SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2,
RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost,
torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB,
A2C2f,
# custom module
C2f_RepConvBlock,
}
)
其次找到`parse_model()`函数,在函数中查找如下内容:
if m in repeat_modules:
args.insert(2, n) # number of repeats
n = 1
与`base_modules`同理,具体添加方式如下:
# For these modules parse_model() inserts the repeat count n into args at
# index 2 and then resets n to 1 (see the `if m in repeat_modules:` branch).
repeat_modules = frozenset( # modules with 'repeat' arguments
{
BottleneckCSP, C1, C2, C2f, C3k2, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3,
C2fPSA, C2fCIB, C2PSA, A2C2f,
# custom module
C2f_RepConvBlock,
}
)
步骤四:修改配置文件
在相应位置添加如下代码即可。
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  # The entries below must be nested under `scales`; at column 0 they would
  # become top-level keys and leave `scales` empty.
  n: [0.33, 0.25, 1024] # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
  s: [0.33, 0.50, 1024] # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
  m: [0.67, 0.75, 768] # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
  l: [1.00, 1.00, 512] # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
  x: [1.00, 1.25, 512] # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs
# YOLOv8.0n backbone
# The C2f stages are replaced by C2f_RepConvBlock; layer indices are noted
# because the head refers to layers 4, 6 and 9 by index.
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 3, C2f_RepConvBlock, [128, True]] # 2
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 6, C2f_RepConvBlock, [256, True]] # 4
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 6, C2f_RepConvBlock, [512, True]] # 6
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 3, C2f_RepConvBlock, [1024, True]] # 8
- [-1, 1, SPPF, [1024, 5]] # 9
# YOLOv8.0n head
# Layer indices continue from the backbone (0-9); Detect reads layers 15, 18, 21.
head:
- [-1, 1, nn.Upsample, [None, 2, 'nearest']] # 10
- [[-1, 6], 1, Concat, [1]] # 11 cat backbone P4
- [-1, 3, C2f, [512]] # 12
- [-1, 1, nn.Upsample, [None, 2, 'nearest']] # 13
- [[-1, 4], 1, Concat, [1]] # 14 cat backbone P3
- [-1, 3, C2f, [256]] # 15 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]] # 16
- [[-1, 12], 1, Concat, [1]] # 17 cat head P4
- [-1, 3, C2f, [512]] # 18 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]] # 19
- [[-1, 9], 1, Concat, [1]] # 20 cat head P5
- [-1, 3, C2f, [1024]] # 21 (P5/32-large)
- [[15, 18, 21], 1, Detect, [nc]] # 22 Detect(P3, P4, P5)