Introduction
To strengthen the feature representation of YOLOv8 for object detection, we borrow the core block proposed by LSNet (CVPR 2025), referred to here as LSBlock. LSNet builds a lightweight vision network around the "See Large, Focus Small" strategy and introduces the LS (Large-Small) convolution, which pairs large-kernel perception with small-kernel aggregation. LSBlock can therefore capture broad perceptual information while performing precise feature aggregation on dynamic, complex visual representations. The experimental results are as follows (performance is validated on the VOC dataset with 100 epochs, batch size 32, and image size 640×640):
Model | mAP50-95 | mAP50 | run time (h) | params (M) | inference time (ms) |
---|---|---|---|---|---|
YOLOv8 | 0.549 | 0.760 | 1.051 | 3.01 | 0.2 + 0.3 (postprocess) |
YOLO11 | 0.553 | 0.757 | 1.142 | 2.59 | 0.2 + 0.3 (postprocess) |
YOLOv8_C2f-LSBlock | 0.531 | 0.746 | 1.256 | 2.68 | 0.3 + 0.3 (postprocess) |
Important note: the improvement in this article may simply not suit the dataset I used; it may still prove effective on other datasets.
The purpose of this article is to lower the difficulty of porting the latest research advances into the YOLO codebase, providing a reference for readers interested in recent work.
Code Migration
Step 1: Port the Code
In the ultralytics framework, module code mainly lives under the ultralytics/nn folder. To keep our additions separate from the official code, create a new extra_modules folder there and place our code inside it (save the file as lsnet.py so that the import in Step 2 resolves). The full code is as follows:
```python
import math
import itertools

import triton
import triton.language as tl
import torch
import torch.nn as nn
from torch.autograd import Function

try:
    # torch>=2.4 moved the AMP decorators to torch.amp and made
    # device_type a required keyword argument
    from functools import partial
    from torch.amp import custom_fwd, custom_bwd
    custom_fwd = partial(custom_fwd, device_type='cuda')
    custom_bwd = partial(custom_bwd, device_type='cuda')
except ImportError:
    from torch.cuda.amp import custom_fwd, custom_bwd

from timm.models.layers import SqueezeExcite

__all__ = ['LSConv', 'LSBlock']
def _grid(numel: int, bs: int) -> tuple:
    return (triton.cdiv(numel, bs),)


@triton.jit
def _idx(i, n: int, c: int, h: int, w: int):
    # decompose a flat NCHW offset into (batch, channel, row, col) indices
    # plus a validity mask for threads past the end of the tensor
    ni = i // (c * h * w)
    ci = (i // (h * w)) % c
    hi = (i // w) % h
    wi = i % w
    m = i < (n * c * h * w)
    return ni, ci, hi, wi, m


@triton.jit
def ska_fwd(
    x_ptr, w_ptr, o_ptr,
    n, ic, h, w, ks, pad, wc,
    BS: tl.constexpr,
    CT: tl.constexpr, AT: tl.constexpr
):
    # forward pass: each output pixel is a ks*ks weighted sum over its
    # neighborhood, with per-position weights shared across channel groups
    pid = tl.program_id(0)
    start = pid * BS
    offs = start + tl.arange(0, BS)
    ni, ci, hi, wi, m = _idx(offs, n, ic, h, w)
    val = tl.zeros((BS,), dtype=AT)
    for kh in range(ks):
        hin = hi - pad + kh
        hb = (hin >= 0) & (hin < h)
        for kw in range(ks):
            win = wi - pad + kw
            b = hb & (win >= 0) & (win < w)
            x_off = ((ni * ic + ci) * h + hin) * w + win
            w_off = ((ni * wc + ci % wc) * ks * ks + (kh * ks + kw)) * h * w + hi * w + wi
            x_val = tl.load(x_ptr + x_off, mask=m & b, other=0.0).to(CT)
            w_val = tl.load(w_ptr + w_off, mask=m, other=0.0).to(CT)
            val += tl.where(b & m, x_val * w_val, 0.0).to(AT)
    tl.store(o_ptr + offs, val.to(CT), mask=m)
@triton.jit
def ska_bwd_x(
    go_ptr, w_ptr, gi_ptr,
    n, ic, h, w, ks, pad, wc,
    BS: tl.constexpr,
    CT: tl.constexpr, AT: tl.constexpr
):
    pid = tl.program_id(0)
    start = pid * BS
    offs = start + tl.arange(0, BS)
    ni, ci, hi, wi, m = _idx(offs, n, ic, h, w)
    val = tl.zeros((BS,), dtype=AT)
    for kh in range(ks):
        ho = hi + pad - kh
        hb = (ho >= 0) & (ho < h)
        for kw in range(ks):
            wo = wi + pad - kw
            b = hb & (wo >= 0) & (wo < w)
            go_off = ((ni * ic + ci) * h + ho) * w + wo
            w_off = ((ni * wc + ci % wc) * ks * ks + (kh * ks + kw)) * h * w + ho * w + wo
            go_val = tl.load(go_ptr + go_off, mask=m & b, other=0.0).to(CT)
            w_val = tl.load(w_ptr + w_off, mask=m, other=0.0).to(CT)
            val += tl.where(b & m, go_val * w_val, 0.0).to(AT)
    tl.store(gi_ptr + offs, val.to(CT), mask=m)


@triton.jit
def ska_bwd_w(
    go_ptr, x_ptr, gw_ptr,
    n, wc, h, w, ic, ks, pad,
    BS: tl.constexpr,
    CT: tl.constexpr, AT: tl.constexpr
):
    pid = tl.program_id(0)
    start = pid * BS
    offs = start + tl.arange(0, BS)
    ni, ci, hi, wi, m = _idx(offs, n, wc, h, w)
    for kh in range(ks):
        hin = hi - pad + kh
        hb = (hin >= 0) & (hin < h)
        for kw in range(ks):
            win = wi - pad + kw
            b = hb & (win >= 0) & (win < w)
            w_off = ((ni * wc + ci) * ks * ks + (kh * ks + kw)) * h * w + hi * w + wi
            val = tl.zeros((BS,), dtype=AT)
            steps = (ic - ci + wc - 1) // wc
            for s in range(tl.max(steps, axis=0)):
                cc = ci + s * wc
                cm = (cc < ic) & m & b
                x_off = ((ni * ic + cc) * h + hin) * w + win
                go_off = ((ni * ic + cc) * h + hi) * w + wi
                x_val = tl.load(x_ptr + x_off, mask=cm, other=0.0).to(CT)
                go_val = tl.load(go_ptr + go_off, mask=cm, other=0.0).to(CT)
                val += tl.where(cm, x_val * go_val, 0.0).to(AT)
            tl.store(gw_ptr + w_off, val.to(CT), mask=m)
class SkaFn(Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        ks = int(math.sqrt(w.shape[2]))
        pad = (ks - 1) // 2
        ctx.ks, ctx.pad = ks, pad
        n, ic, h, width = x.shape
        wc = w.shape[1]
        o = torch.empty(n, ic, h, width, device=x.device, dtype=x.dtype)
        numel = o.numel()
        x = x.contiguous()
        w = w.contiguous()
        grid = lambda meta: _grid(numel, meta["BS"])
        ct = tl.float16 if x.dtype == torch.float16 else (tl.float32 if x.dtype == torch.float32 else tl.float64)
        at = tl.float32 if x.dtype == torch.float16 else ct
        ska_fwd[grid](x, w, o, n, ic, h, width, ks, pad, wc, BS=1024, CT=ct, AT=at)
        ctx.save_for_backward(x, w)
        ctx.ct, ctx.at = ct, at
        return o

    @staticmethod
    @custom_bwd
    def backward(ctx, go: torch.Tensor) -> tuple:
        ks, pad = ctx.ks, ctx.pad
        x, w = ctx.saved_tensors
        n, ic, h, width = x.shape
        wc = w.shape[1]
        go = go.contiguous()
        gx = gw = None
        ct, at = ctx.ct, ctx.at
        if ctx.needs_input_grad[0]:
            gx = torch.empty_like(x)
            numel = gx.numel()
            ska_bwd_x[lambda meta: _grid(numel, meta["BS"])](go, w, gx, n, ic, h, width, ks, pad, wc, BS=1024, CT=ct, AT=at)
        if ctx.needs_input_grad[1]:
            gw = torch.empty_like(w)
            numel = gw.numel() // w.shape[2]
            ska_bwd_w[lambda meta: _grid(numel, meta["BS"])](go, x, gw, n, wc, h, width, ic, ks, pad, BS=1024, CT=ct, AT=at)
        return gx, gw, None, None


class SKA(torch.nn.Module):
    def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        return SkaFn.apply(x, w)  # type: ignore
class Conv2d_BN(torch.nn.Sequential):
    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def fuse(self):
        c, bn = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps)**0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps)**0.5
        m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
            0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups,
            device=c.weight.device)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


class Residual(torch.nn.Module):
    def __init__(self, m, drop=0.):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        if self.training and self.drop > 0:
            return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
                device=x.device).ge_(self.drop).div(1 - self.drop).detach()
        else:
            return x + self.m(x)


class FFN(torch.nn.Module):
    def __init__(self, ed, h):
        super().__init__()
        self.pw1 = Conv2d_BN(ed, h)
        self.act = torch.nn.ReLU()
        self.pw2 = Conv2d_BN(h, ed, bn_weight_init=0)

    def forward(self, x):
        x = self.pw2(self.act(self.pw1(x)))
        return x
class Attention(torch.nn.Module):
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        h = self.dh + nh_kd * 2
        self.qkv = Conv2d_BN(dim, h, ks=1)
        self.proj = torch.nn.Sequential(torch.nn.ReLU(), Conv2d_BN(
            self.dh, dim, bn_weight_init=0))
        self.dw = Conv2d_BN(nh_kd, nh_kd, 3, 1, 1, groups=nh_kd)

        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, x):
        B, _, H, W = x.shape
        N = H * W
        qkv = self.qkv(x)
        q, k, v = qkv.view(B, -1, H, W).split([self.nh_kd, self.nh_kd, self.dh], dim=1)
        q = self.dw(q)
        q, k, v = q.view(B, self.num_heads, -1, N), k.view(B, self.num_heads, -1, N), v.view(B, self.num_heads, -1, N)
        attn = (
            (q.transpose(-2, -1) @ k) * self.scale
            +
            (self.attention_biases[:, self.attention_bias_idxs]
             if self.training else self.ab)
        )
        attn = attn.softmax(dim=-1)
        x = (v @ attn.transpose(-2, -1)).reshape(B, -1, H, W)
        x = self.proj(x)
        return x
class RepVGGDW(torch.nn.Module):
    def __init__(self, ed) -> None:
        super().__init__()
        self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
        self.conv1 = Conv2d_BN(ed, ed, 1, 1, 0, groups=ed)
        self.dim = ed

    def forward(self, x):
        return self.conv(x) + self.conv1(x) + x

    @torch.no_grad()
    def fuse(self):
        # merge the 3x3 branch, the 1x1 branch and the identity into one conv
        conv = self.conv.fuse()
        conv1 = self.conv1.fuse()

        conv_w = conv.weight
        conv_b = conv.bias
        conv1_w = conv1.weight
        conv1_b = conv1.bias

        conv1_w = torch.nn.functional.pad(conv1_w, [1, 1, 1, 1])
        identity = torch.nn.functional.pad(torch.ones(conv1_w.shape[0], conv1_w.shape[1], 1, 1, device=conv1_w.device), [1, 1, 1, 1])

        final_conv_w = conv_w + conv1_w + identity
        final_conv_b = conv_b + conv1_b

        conv.weight.data.copy_(final_conv_w)
        conv.bias.data.copy_(final_conv_b)
        return conv


class LKP(nn.Module):
    def __init__(self, dim, lks, sks, groups):
        super().__init__()
        self.cv1 = Conv2d_BN(dim, dim // 2)
        self.act = nn.ReLU()
        self.cv2 = Conv2d_BN(dim // 2, dim // 2, ks=lks, pad=(lks - 1) // 2, groups=dim // 2)
        self.cv3 = Conv2d_BN(dim // 2, dim // 2)
        self.cv4 = nn.Conv2d(dim // 2, sks ** 2 * dim // groups, kernel_size=1)
        self.norm = nn.GroupNorm(num_groups=dim // groups, num_channels=sks ** 2 * dim // groups)

        self.sks = sks
        self.groups = groups
        self.dim = dim

    def forward(self, x):
        # large-kernel perception: predict per-position small-kernel weights
        x = self.act(self.cv3(self.cv2(self.act(self.cv1(x)))))
        w = self.norm(self.cv4(x))
        b, _, h, width = w.size()
        w = w.view(b, self.dim // self.groups, self.sks ** 2, h, width)
        return w


class LSConv(nn.Module):
    def __init__(self, dim):
        super(LSConv, self).__init__()
        self.lkp = LKP(dim, lks=7, sks=3, groups=8)
        self.ska = SKA()
        self.bn = nn.BatchNorm2d(dim)

    def forward(self, x):
        # small-kernel aggregation driven by the weights from the LKP branch
        return self.bn(self.ska(x, self.lkp(x))) + x
class LSBlock(torch.nn.Module):
    def __init__(self,
                 ed, kd=16, nh=8,
                 ar=4,
                 resolution=14,
                 stage=-1, depth=-1):
        super().__init__()
        if depth % 2 == 0:
            self.mixer = RepVGGDW(ed)
            self.se = SqueezeExcite(ed, 0.25)
        else:
            # with the defaults used by C2f_LSBlock below (stage=-1, depth=-1),
            # this branch is taken and the mixer is LSConv
            if stage == 3:
                self.mixer = Residual(Attention(ed, kd, nh, ar, resolution=resolution))
            else:
                self.mixer = LSConv(ed)
            self.se = torch.nn.Identity()
        self.ffn = Residual(FFN(ed, int(ed * 2)))

    def forward(self, x):
        return self.ffn(self.se(self.mixer(x)))
```
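Before wiring the block into YOLOv8, a quick smoke test is worthwhile, since the Triton kernels only compile on a GPU. The following is a minimal sketch, assuming a CUDA-capable machine with triton and timm installed and the code above saved as lsnet.py:

```python
import torch
from lsnet import LSBlock  # assumes the code above was saved as lsnet.py

x = torch.randn(2, 64, 32, 32, device='cuda')
block = LSBlock(ed=64).cuda()  # defaults stage=-1, depth=-1 -> LSConv mixer
y = block(x)
print(y.shape)  # expected: torch.Size([2, 64, 32, 32])

# check that gradients flow through the custom SKA forward/backward
y.sum().backward()
```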
Step 2: Create the Module and Import It
Next, create a block.py file in the same directory to manage the custom C2f variants in one place (you could also add the class directly to ultralytics/nn/modules/block.py). Its content is as follows:
```python
import torch.nn as nn

from ..modules import C2f
from .lsnet import LSBlock


class C2f_LSBlock(C2f):
    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        # replace the default Bottleneck blocks with LSBlocks on the hidden width self.c
        self.m = nn.ModuleList(LSBlock(self.c) for _ in range(n))
```
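Since LSBlock preserves its channel count (ed in, ed out), it can stand in directly for the Bottleneck blocks in C2f's self.m; each LSBlock runs on the hidden width self.c = int(c2 * e). The shortcut and g arguments are accepted only for YAML compatibility and are not used by LSBlock. A quick shape check with hypothetical values (again requiring a CUDA device for the Triton kernel):

```python
import torch

# C2f_LSBlock(c1=64, c2=64, n=2): two LSBlocks on self.c = 32 hidden channels
m = C2f_LSBlock(64, 64, n=2).cuda()
print(m(torch.randn(1, 64, 40, 40, device='cuda')).shape)  # torch.Size([1, 64, 40, 40])
```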
Once that is added, create an __init__.py file and import the new module in it, so callers can simply write from extra_modules import *. The __init__.py file needs the following content:
```python
from .block import C2f_LSBlock
```
The resulting directory structure is sketched below (file names assume the Step 1 code was saved as lsnet.py):
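```
ultralytics/
└── nn/
    ├── extra_modules/
    │   ├── __init__.py   # from .block import C2f_LSBlock
    │   ├── block.py      # C2f_LSBlock
    │   └── lsnet.py      # LSConv / LSBlock and the Triton SKA kernels
    ├── modules/
    └── tasks.py
```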
Step 3: Modify the tasks.py File
First, add the following import to tasks.py:

```python
from ultralytics.nn.extra_modules import *
```

Then locate the parse_model() function and find the following code inside it:
```python
if m in base_modules:
    c1, c2 = ch[f], args[0]
    if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
        c2 = make_divisible(min(c2, max_channels) * width, 8)
```
If you are on an older ultralytics version, this may not be named base_modules; the membership check may instead use an inline collection of module classes, in which case add the new class to that collection directly. Otherwise, find the set that base_modules refers to and extend it as follows:
```python
base_modules = frozenset(
    {
        Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck,
        SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2,
        RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost,
        torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB,
        A2C2f,
        # custom modules
        C2f_LSBlock,
    }
)
```
Next, still in parse_model(), find the following code:
```python
if m in repeat_modules:
    args.insert(2, n)  # number of repeats
    n = 1
```
As with base_modules, extend the set as follows:
```python
repeat_modules = frozenset(  # modules with 'repeat' arguments
    {
        BottleneckCSP, C1, C2, C2f, C3k2, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3,
        C2fPSA, C2fCIB, C2PSA, A2C2f,
        # custom modules
        C2f_LSBlock,
    }
)
```
Step 4: CUDA
The SKA operator inside LSBlock is implemented as a Triton GPU kernel with custom forward and backward passes, so both the model and the data must live on the GPU. This requires modifying the BaseModel class in the tasks.py file as follows (note that this hard-codes CUDA, so the modified model will no longer run on CPU-only machines):
```python
# Replace BaseModel.forward with the following
def forward(self, x, *args, **kwargs):
    self.model = self.model.to('cuda')
    if isinstance(x, dict):  # for cases of training and validating while training.
        return self.loss(x, *args, **kwargs)
    x = x.to('cuda')
    return self.predict(x, *args, **kwargs)

# Replace BaseModel.loss with the following
def loss(self, batch, preds=None):
    if getattr(self, "criterion", None) is None:
        self.criterion = self.init_criterion()
    preds = self.forward(batch["img"].to('cuda')) if preds is None else preds
    return self.criterion(preds, batch)
```
Step 5: Modify the Configuration File
Add the following at the corresponding places in the model YAML (only the backbone C2f entries change to C2f_LSBlock; the head keeps the stock C2f):
```yaml
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.33, 0.25, 1024] # YOLOv8n summary: 129 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPS
  s: [0.33, 0.50, 1024] # YOLOv8s summary: 129 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPS
  m: [0.67, 0.75, 768] # YOLOv8m summary: 169 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPS
  l: [1.00, 1.00, 512] # YOLOv8l summary: 209 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPS
  x: [1.00, 1.25, 512] # YOLOv8x summary: 209 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPS

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 3, C2f_LSBlock, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 6, C2f_LSBlock, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 6, C2f_LSBlock, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f_LSBlock, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)
  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
  - [[15, 18, 21], 1, Detect, [nc]] # Detect(P3, P4, P5)
```
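With everything in place, training can be launched as usual. Below is a minimal sketch matching the settings from the introduction; the config name yolov8-C2f-LSBlock.yaml is an assumption, so point it at whatever name you gave the Step 5 YAML:

```python
from ultralytics import YOLO

# the config name is hypothetical; use the YAML created in Step 5
model = YOLO('yolov8-C2f-LSBlock.yaml')
model.train(data='VOC.yaml', epochs=100, batch=32, imgsz=640, device=0)
```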