本文介绍
IJCV2025 SRConvNet 提出了一种新型轻量化卷积网络SRConvNet,在单图像超分辨率任务中实现了Transformer级性能与卷积网络级效率的突破。SRConvNet通过两大创新设计缓解了现有方法的局限性:首先,傅里叶调制注意力机制(FMA)利用频域1x1卷积实现全局上下文建模,以线性计算复杂度模拟Transformer的长程依赖捕捉能力;其次,多尺度动态混合层(DML)通过并行动态卷积与通道shuffling机制,实现了自适应跨尺度特征融合。论文实验表明,SRConvNet在x4超分任务中,较SwinIR-light参数减少56%,FLOPs降低62%。本文通过构建SRConvBlock模块以改进C2f模块,具体实验结果如下(本文通过VOC数据验证算法性能,epoch为100,batchsize为32,imagesize为640*640):
Model | mAP50-95 | mAP50 | run time (h) | params (M) | inference time (ms) |
---|---|---|---|---|---|
YOLOv8 | 0.549 | 0.760 | 1.051 | 3.01 | 0.2+0.3(postprocess) |
YOLO11 | 0.553 | 0.757 | 1.142 | 2.59 | 0.2+0.3(postprocess) |
yolov8_C2f-SRConvBlock | 0.550 | 0.759 | 1.527 | 3.15 | 0.5+0.3(postprocess) |
重要声明:本文改进后的代码可能只是不适用于我所使用的数据集,在其他数据集上仍可能有效。
本文改进是为了降低最新研究进展至YOLO的代码迁移难度,从而为对最新研究感兴趣的同学提供参考。
代码迁移
重点内容
步骤一:迁移代码
ultralytics框架的模块代码主要放在ultralytics/nn
文件夹下,此处为了与官方代码进行区分,可以新增一个extra_modules
文件夹,然后将我们的代码添加进入。
具体代码如下:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
import os
class MeanShift(nn.Conv2d):
    """Frozen 1x1 convolution that shifts (and optionally rescales) RGB channels.

    The kernel is a 3x3 identity divided per output channel by ``rgb_std``;
    the bias is ``sign * rgb_range * rgb_mean / rgb_std``. All parameters are
    excluded from gradient computation.

    Args:
        rgb_range: Scalar multiplied into the mean to form the bias.
        rgb_mean: Per-channel mean (3 values).
        rgb_std: Per-channel standard deviation (3 values).
        sign: Multiplier applied to the bias (``-1`` by default).
    """

    def __init__(
            self, rgb_range,
            rgb_mean=(0.4488, 0.4371, 0.4040), rgb_std=(1.0, 1.0, 1.0), sign=-1):
        super().__init__(3, 3, kernel_size=1)
        channel_std = torch.Tensor(rgb_std)
        channel_mean = torch.Tensor(rgb_mean)
        identity_kernel = torch.eye(3).view(3, 3, 1, 1)
        self.weight.data = identity_kernel / channel_std.view(3, 1, 1, 1)
        self.bias.data = sign * rgb_range * channel_mean / channel_std
        # The layer is a fixed transform, so none of its parameters are trained.
        for param in self.parameters():
            param.requires_grad = False
class LayerNorm(nn.Module):
    r"""Layer normalisation over the channel axis, from ConvNeXt
    (https://arxiv.org/pdf/2201.03545.pdf).

    ``data_format="channels_last"`` delegates to ``F.layer_norm`` over the
    trailing dimension; ``data_format="channels_first"`` normalises dimension 1
    by hand and applies the affine parameters broadcast over two trailing dims.

    Raises:
        NotImplementedError: If ``data_format`` is neither of the two values.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.data_format == "channels_first":
            # Biased variance over dim 1, matching F.layer_norm's statistics.
            centered = x - x.mean(1, keepdim=True)
            variance = centered.pow(2).mean(1, keepdim=True)
            normalised = centered / torch.sqrt(variance + self.eps)
            return self.weight[:, None, None] * normalised + self.bias[:, None, None]
        # Unreachable after the constructor check; falls through to None.
class FourierUnit(nn.Module):
    """1x1 convolution applied to the real 2-D spectrum of a feature map.

    The input is transformed with ``rfft2``, its real and imaginary parts are
    folded into the channel axis, passed through a pointwise convolution and
    GELU, unfolded back into a complex tensor and transformed back with
    ``irfft2``.

    Args:
        dim: Number of input/output channels.
        groups: ``groups`` of the spectral 1x1 convolution.
        fft_norm: Normalisation mode used for both the forward and the
            inverse FFT.
    """

    def __init__(self, dim, groups=1, fft_norm='ortho'):
        super().__init__()
        self.groups = groups
        self.fft_norm = fft_norm
        self.conv_layer = nn.Conv2d(in_channels=dim * 2, out_channels=dim * 2, kernel_size=1, stride=1,
                                    padding=0, groups=self.groups, bias=False)
        self.act = nn.GELU()

    def forward(self, x):
        """Return a real tensor with the same shape as ``x`` (batch, c, h, w)."""
        batch = x.size(0)
        spatial_size = tuple(x.shape[-2:])

        # One-sided spectrum: (batch, c, h, w/2+1), complex.
        ffted = torch.fft.rfft2(x, dim=(-2, -1), norm=self.fft_norm)
        # (batch, c, h, w/2+1, 2) -> (batch, c, 2, h, w/2+1) -> (batch, c*2, h, w/2+1)
        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
        ffted = ffted.permute(0, 1, 4, 2, 3).contiguous()
        ffted = ffted.view((batch, -1,) + ffted.size()[3:])

        ffted = self.act(self.conv_layer(ffted))

        # Back to (batch, c, h, w/2+1, 2) so the last axis is (real, imag).
        ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute(0, 1, 3, 4, 2).contiguous()
        ffted_complex = torch.view_as_complex(ffted)

        # The spectrum is one-sided, so it must be inverted with the real
        # inverse transform (the modern form of the legacy
        # ``torch.irfft(..., signal_sizes=...)`` call). A full complex
        # ``ifft2`` would zero-pad the missing half instead of using Hermitian
        # symmetry, and taking ``abs()`` would discard the sign.
        return torch.fft.irfft2(ffted_complex, s=spatial_size, dim=(-2, -1), norm=self.fft_norm)
class FConvMod(nn.Module):
    """Fourier-modulated attention block with a residual connection.

    The normalised input is sent through a ``FourierUnit`` (modulator ``a``)
    and a 1x1 convolution (value ``v``). Their element-wise product is scaled
    per head, soft-maxed over the flattened spatial axis, summed with a
    depth-wise 3x3 positional branch, projected, and added to the input.

    Args:
        dim: Number of channels; must be divisible by ``num_heads`` for the
            head/channel rearrangement to succeed.
        num_heads: Number of heads the channels are split into.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        layer_scale_init_value = 1e-6
        self.num_heads = num_heads
        self.norm = LayerNorm(dim, eps=1e-6, data_format="channels_first")
        self.a = FourierUnit(dim)
        self.v = nn.Conv2d(dim, dim, 1)
        self.act = nn.GELU()
        self.layer_scale = nn.Parameter(layer_scale_init_value * torch.ones(num_heads), requires_grad=True)
        self.CPE = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)
        self.proj = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        """Map ``x`` of shape (B, C, H, W) to a tensor of the same shape."""
        _, _, H, W = x.shape
        shortcut = x
        pos_embed = self.CPE(x)
        x = self.norm(x)
        a = rearrange(self.a(x), 'b (head c) h w -> b head c (h w)', head=self.num_heads)
        v = rearrange(self.v(x), 'b (head c) h w -> b head c (h w)', head=self.num_heads)
        # The modulation is purely element-wise, so it is computed in one shot.
        # Splitting the spatial axis into chunks of N // 4 and concatenating the
        # per-chunk products gives the same values, but raises for H * W < 4
        # because the split size becomes 0.
        x = self.layer_scale.unsqueeze(-1).unsqueeze(-1) * (a * v)
        x = F.softmax(x, dim=-1)
        x = rearrange(x, 'b head c (h w) -> b (head c) h w', head=self.num_heads, h=H, w=W)
        x = x + pos_embed
        x = self.proj(x)
        return x + shortcut
class KernelAggregation(nn.Module):
    """Bank of ``num_kernels`` convolution kernels mixed per sample.

    ``forward`` combines the kernels (and biases, if enabled) linearly with the
    supplied attention weights, then applies the resulting per-sample kernels
    in a single grouped convolution by folding the batch into the channel axis.

    Args:
        dim: Number of input/output channels.
        kernel_size: Spatial size of each (square) kernel.
        groups: Convolution groups per sample.
        num_kernels: Number of kernels in the bank.
        bias: Whether to keep a per-kernel bias bank.
        init_weight: Whether to re-initialise each kernel with Kaiming uniform.
    """

    def __init__(self, dim, kernel_size, groups, num_kernels, bias=True, init_weight=True):
        super().__init__()
        self.groups = groups
        self.num_kernels = num_kernels
        self.kernel_size = kernel_size
        self.dim = dim
        self.weight = nn.Parameter(
            torch.randn(num_kernels, dim, dim // groups, kernel_size, kernel_size),
            requires_grad=True,
        )
        self.bias = nn.Parameter(torch.zeros(num_kernels, dim)) if bias else None

        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        """Apply Kaiming-uniform initialisation to every kernel of the bank."""
        for index in range(self.num_kernels):
            nn.init.kaiming_uniform_(self.weight[index])

    def forward(self, x, attention):
        """Convolve ``x`` (B, dim, H, W) with kernels mixed by ``attention`` (B, num_kernels)."""
        batch, _, height, width = x.shape
        k = self.kernel_size

        # Fold the batch into channels so one grouped conv handles every sample.
        folded = x.contiguous().view(1, batch * self.dim, height, width)

        kernel_bank = self.weight.contiguous().view(self.num_kernels, -1)
        mixed_kernels = torch.mm(attention, kernel_bank).contiguous().view(
            batch * self.dim, self.dim // self.groups, k, k)

        mixed_bias = None
        if self.bias is not None:
            mixed_bias = torch.mm(attention, self.bias).contiguous().view(-1)

        out = F.conv2d(folded, weight=mixed_kernels, bias=mixed_bias, stride=1, padding=k // 2,
                       groups=self.groups * batch)
        return out.contiguous().view(batch, self.dim, out.shape[-2], out.shape[-1])
class KernelAttention(nn.Module):
    """Squeeze-and-excite style head that scores ``num_kernels`` kernels per sample.

    The input is globally average-pooled, passed through a two-layer 1x1-conv
    bottleneck with GELU, and squashed with a sigmoid.

    Args:
        dim: Number of input channels.
        reduction: Channel reduction factor of the bottleneck.
        num_kernels: Number of attention scores produced per sample.
    """

    def __init__(self, dim, reduction=8, num_kernels=8):
        super().__init__()
        if dim != 3:
            # Keep at least one hidden channel: with ``dim < reduction`` the
            # floor division yields 0, which leaves conv2 with no inputs and
            # makes the attention independent of the feature map.
            mid_channels = max(1, dim // reduction)
        else:
            mid_channels = num_kernels
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(dim, mid_channels, 1)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(mid_channels, num_kernels, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return attention scores of shape (B, num_kernels) for ``x`` (B, dim, H, W)."""
        x = self.avg_pool(x)
        x = self.conv1(x)
        x = self.act(x)
        x = self.conv2(x)
        # Drop the 1x1 spatial dims left by the global pooling.
        x = x.view(x.shape[0], -1)
        return self.sigmoid(x)
class DynamicKernelAggregation(nn.Module):
    """Dynamic convolution: attention scores computed from the input select
    how the kernels of a ``KernelAggregation`` bank are mixed.

    Args:
        dim: Number of input/output channels; must be divisible by ``groups``.
        kernel_size: Spatial size of each kernel.
        groups: Convolution groups.
        num_kernels: Number of kernels in the bank.
    """

    def __init__(self, dim, kernel_size, groups=1, num_kernels=4):
        super().__init__()
        assert dim % groups == 0
        self.attention = KernelAttention(dim, num_kernels=num_kernels)
        self.aggregation = KernelAggregation(
            dim,
            kernel_size=kernel_size,
            groups=groups,
            num_kernels=num_kernels,
        )

    def forward(self, x):
        """Score the kernels from ``x`` and convolve ``x`` with their mixture."""
        kernel_scores = self.attention(x)
        return self.aggregation(x, kernel_scores)
class DyConv(nn.Module):
    """Size-preserving convolution that is dynamic when ``num_kernels > 1``.

    With more than one kernel the layer is a ``DynamicKernelAggregation``;
    otherwise it is a plain ``nn.Conv2d``.

    Args:
        dim: Number of input/output channels.
        kernel_size: Spatial kernel size.
        groups: Convolution groups.
        num_kernels: Number of kernels to mix; ``1`` selects the static conv.
    """

    def __init__(self, dim, kernel_size, groups, num_kernels=1):
        super().__init__()
        if num_kernels > 1:
            self.conv = DynamicKernelAggregation(dim, kernel_size=kernel_size, groups=groups,
                                                 num_kernels=num_kernels)
        else:
            # Pad by kernel_size // 2 like the dynamic branch does. Without it
            # the output shrinks by kernel_size - 1, so branches with different
            # kernel sizes can no longer be combined.
            self.conv = nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2,
                                  groups=groups)

    def forward(self, x):
        """Apply the selected convolution to ``x``."""
        return self.conv(x)
class MixFFN(nn.Module):
    """Feed-forward block mixing a 5x5 and a 7x7 depth-wise ``DyConv`` branch.

    The normalised input is expanded to ``2 * dim`` channels and split in two
    halves, each processed by its own branch. The branch outputs are
    interleaved channel by channel, projected back to ``dim`` channels and
    added to the input.

    Args:
        dim: Number of input/output channels.
        num_kernels: Number of kernels handed to each ``DyConv`` branch.
    """

    def __init__(self, dim, num_kernels):
        super().__init__()
        self.proj_in = nn.Conv2d(dim, dim * 2, 1)
        self.conv1 = DyConv(dim, kernel_size=5, groups=dim, num_kernels=num_kernels)
        self.conv2 = DyConv(dim, kernel_size=7, groups=dim, num_kernels=num_kernels)
        self.proj_out = nn.Conv2d(dim * 2, dim, 1)
        self.norm = LayerNorm(dim, eps=1e-6, data_format="channels_first")
        self.act = nn.GELU()

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        residual = x
        expanded = self.act(self.proj_in(self.norm(x)))
        half_a, half_b = expanded.chunk(2, dim=1)
        branch_a = self.act(self.conv1(half_a))
        branch_b = self.act(self.conv2(half_b))
        # Stack on a new axis right after the channels and merge the two, so
        # channels alternate branch_a[0], branch_b[0], branch_a[1], ...
        interleaved = torch.stack((branch_a, branch_b), dim=2).flatten(1, 2)
        return self.proj_out(interleaved) + residual
class SRConvBlock(nn.Module):
    """One SRConvNet stage: ``FConvMod`` attention followed by a ``MixFFN``.

    Args:
        dim: Number of input/output channels.
        num_heads: Number of heads passed to ``FConvMod``.
        num_kernels: Number of dynamic kernels passed to ``MixFFN``.
    """

    def __init__(self, dim, num_heads=8, num_kernels=16):
        super().__init__()
        self.attention = FConvMod(dim, num_heads)
        self.ffn = MixFFN(dim, num_kernels)

    def forward(self, x):
        """Run the attention sub-block, then the feed-forward sub-block."""
        attended = self.attention(x)
        return self.ffn(attended)
if __name__ == '__main__':
    # Smoke test: one forward pass through a single block, printing the output shape.
    inputs = torch.randn(1, 64, 640, 640)
    model = SRConvBlock(64)
    # No gradients are needed here; skipping the autograd graph keeps the
    # memory use of this large (640x640) forward pass down.
    with torch.no_grad():
        outputs = model(inputs)
    print(outputs.shape)
步骤二:创建模块并导入
此时需要在当前目录新建一个block.py
文件用以统一管理自定义的C2f模块(当然也可以直接在ultralytics/nn/modules/block.py
中直接添加)。内容如下:
import torch.nn as nn
from ..modules import C2f
from .srconvnet import SRConvBlock
class C2f_SRConvBlock(C2f):
    """C2f variant whose ``n`` inner blocks are ``SRConvBlock`` instances.

    All arguments are forwarded unchanged to ``C2f.__init__``; afterwards
    ``self.m`` is replaced by ``n`` ``SRConvBlock(self.c)`` modules.
    ``shortcut`` and ``g`` are not passed on to ``SRConvBlock``.
    """

    def __init__(self, c1, c2, n = 1, shortcut = False, g = 1, e = 0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        # self.c is read after the parent constructor runs, so it is
        # presumably the hidden width that C2f sets up -- confirm in C2f.
        inner_blocks = [SRConvBlock(self.c) for _ in range(n)]
        self.m = nn.ModuleList(inner_blocks)
添加完成之后需要新增一个__init__.py
文件,将添加的模块导入到__init__.py
文件中,这样在调用的时候就可以直接使用from extra_modules import *
。__init__.py
文件需要撰写以下内容:
from .block import C2f_SRConvBlock
具体目录结构如下图所示:
nn/
└── extra_modules/
├── __init__.py
├── block.py
└── srconvnet.py
步骤三:修改tasks.py
文件
首先在tasks.py
文件中添加以下内容:
from ultralytics.nn.extra_modules import *
然后找到parse_model()
函数,在函数中查找如下内容:
if m in base_modules:
c1, c2 = ch[f], args[0]
if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output)
c2 = make_divisible(min(c2, max_channels) * width, 8)
使用较老ultralytics版本的同学,此处可能不是
base_modules
,而是直接列出相关模块的集合,此时将自定义模块直接添加到该集合即可;否则就找到base_modules
所指向的集合进行添加,添加方式如下:
# Module classes that take the `if m in base_modules:` branch of parse_model().
# The custom block is appended at the end of the set.
base_modules = frozenset(
{
Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck,
SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2,
RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost,
torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB,
A2C2f,
# Custom modules
C2f_SRConvBlock,
}
)
其次找到parse_model()
函数,在函数中查找如下内容:
if m in repeat_modules:
args.insert(2, n) # number of repeats
n = 1
与base_modules
同理,具体添加方式如下:
# Module classes that take the `if m in repeat_modules:` branch of parse_model().
# The custom block is appended at the end of the set.
repeat_modules = frozenset( # modules with 'repeat' arguments
{
BottleneckCSP, C1, C2, C2f, C3k2, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3,
C2fPSA, C2fCIB, C2PSA, A2C2f,
# Custom modules
C2f_SRConvBlock,
}
)
步骤四:修改配置文件
训练时需要设置
amp=False
。
在相应位置添加如下代码即可。
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  # The per-scale entries must be nested under `scales`; at column 0 they
  # would be parsed as unrelated top-level keys and `scales` would be null.
  n: [0.33, 0.25, 1024] # YOLOv8n summary: 129 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPS
  s: [0.33, 0.50, 1024] # YOLOv8s summary: 129 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPS
  m: [0.67, 0.75, 768] # YOLOv8m summary: 169 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPS
  l: [1.00, 1.00, 512] # YOLOv8l summary: 209 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPS
  x: [1.00, 1.25, 512] # YOLOv8x summary: 209 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPS

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 3, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 9, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f_SRConvBlock, [512]] # 18 (P4/16-medium), custom block

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f_SRConvBlock, [1024]] # 21 (P5/32-large), custom block

  - [[15, 18, 21], 1, Detect, [nc]] # Detect(P3, P4, P5)