本文介绍
为提升 YOLOv8 在目标检测任务中的特征表达能力,我们借鉴了 CVPR2025 LSNet 提出的核心模块Block(此处称作LSBlock)。LSNet基于“看大,聚小”策略构建轻量化视觉网络结构,并提出了结合大内核感知和小内核聚合的LS(Large-Small)卷积。因此,LSBlock能有效捕捉广泛的感知信息,并针对动态和复杂的视觉表征实现精确的特征聚合,从而实现对视觉信息的熟练处理。实验结果如下(本文通过VOC数据验证算法性能,epoch为100,batchsize为32,imagesize为640*640):
| Model | mAP50-95 | mAP50 | run time (h) | params (M) | inference time (ms) |
|---|---|---|---|---|---|
| YOLOv8 | 0.549 | 0.760 | 1.051 | 3.01 | 0.2+0.3(postprocess) |
| YOLO11 | 0.553 | 0.757 | 1.142 | 2.59 | 0.2+0.3(postprocess) |
| YOLOv8_C2f-LSBlock | 0.531 | 0.746 | 1.256 | 2.68 | 0.3+0.3(postprocess) |

重要声明:本文改进后的代码可能只是不适用于我所使用的数据集,在其他数据集上仍可能有效。
本文改进是为了降低将最新研究进展迁移至YOLO的代码难度,从而为对最新研究感兴趣的同学提供参考。
代码迁移
重点内容
步骤一:迁移代码
ultralytics框架的模块代码主要放在ultralytics/nn文件夹下,此处为了与官方代码进行区分,可以新增一个extra_modules文件夹,然后将我们的代码添加进去。
具体代码如下:
import math
import itertools
import triton
import triton.language as tl
import torch
import torch.nn as nn
from torch.autograd import Function
# Prefer the device-agnostic AMP decorators; fall back to the CUDA-specific
# ones when this torch build does not export them from ``torch.amp``.
# Only ImportError is caught, so unrelated failures (and KeyboardInterrupt /
# SystemExit) are not silently swallowed.
try:
    from torch.amp import custom_fwd, custom_bwd
except ImportError:
    from torch.cuda.amp import custom_fwd, custom_bwd
from timm.models.layers import SqueezeExcite
__all__ = ['LSConv', 'LSBlock']
def _grid(numel: int, bs: int) -> tuple:
    """Return a one-element tuple holding ``ceil(numel / bs)``.

    The count is computed with ``triton.cdiv``. Presumably this is used as a
    1-D kernel launch grid (one program per ``bs``-sized block) -- the call
    sites are not visible here, so confirm against the callers.
    """
    n_blocks = triton.cdiv(numel, bs)
    return (n_blocks,)
@triton.jit
def _idx(i, n: int, c: int, h: int, w: int):
    """Split flat offsets ``i`` into four coordinates plus a bounds mask.

    ``i`` is interpreted as a row-major offset into an ``n * c * h * w``
    element range. Returns ``(ni, ci, hi, wi, m)`` where the first four are
    the coordinates along the ``n``, ``c``, ``h`` and ``w`` extents and ``m``
    is true where ``i < n * c * h * w``.
    """
    plane = h * w
    batch_idx = i // (c * plane)
    chan_idx = (i // plane) % c
    row_idx = (i // w) % h
    col_idx = i % w
    # Offsets at or past the total element count are flagged as invalid.
    in_range = i < (n * c * plane)
    return batch_idx, chan_idx, row_idx, col_idx, in_range
@triton.jit
def ska_fwd(
    x_ptr, w_ptr, o_ptr,
    n, ic, h, w, ks, pad, wc,
    BS: tl.constexpr,
    CT: tl.constexpr, AT: tl.constexpr
):
    """Per-position weighted sum over a ``ks x ks`` neighbourhood.

    Each program handles ``BS`` consecutive flat output offsets. For the
    output element at ``(ni, ci, hi, wi)`` it accumulates

        x[ni, ci, hi - pad + kh, wi - pad + kw]
            * w[ni, ci % wc, kh * ks + kw, hi, wi]

    over all ``kh, kw`` in ``[0, ks)``; taps that fall outside the
    ``h x w`` extent contribute zero.

    The indexing treats ``x`` and the output as contiguous
    ``(n, ic, h, w)`` buffers and ``w`` as a contiguous
    ``(n, wc, ks * ks, h, w)`` buffer. Loaded values are cast to ``CT``,
    the running sum has dtype ``AT``, and the result is stored as ``CT``.
    """
    block_start = tl.program_id(0) * BS
    flat = block_start + tl.arange(0, BS)
    bi, ch, row, col, valid = _idx(flat, n, ic, h, w)

    acc = tl.zeros((BS,), dtype=AT)
    for dy in range(ks):
        src_row = row - pad + dy
        row_ok = (src_row >= 0) & (src_row < h)
        for dx in range(ks):
            src_col = col - pad + dx
            inside = row_ok & (src_col >= 0) & (src_col < w)

            x_index = ((bi * ic + ch) * h + src_row) * w + src_col
            # Weights are indexed by the *output* position, which is always
            # in range for valid lanes, so only the lane mask is needed.
            w_index = ((bi * wc + ch % wc) * ks * ks + (dy * ks + dx)) * h * w + row * w + col

            x_elem = tl.load(x_ptr + x_index, mask=valid & inside, other=0.0).to(CT)
            w_elem = tl.load(w_ptr + w_index, mask=valid, other=0.0).to(CT)
            acc += tl.where(inside & valid, x_elem * w_elem, 0.0).to(AT)

    tl.store(o_ptr + flat, acc.to(CT), mask=valid)
@triton.jit
def ska_bwd_x(
    go_ptr, w_ptr, gi_ptr,
    n, ic, h, w, ks, pad, wc,
    BS: tl.constexpr,
    CT: tl.constexpr, AT: tl.constexpr
):
    """Accumulate ``go * w`` over a mirrored ``ks x ks`` neighbourhood.

    Each program handles ``BS`` consecutive flat offsets. For the element at
    ``(ni, ci, hi, wi)`` it sums

        go[ni, ci, ho, wo] * w[ni, ci % wc, kh * ks + kw, ho, wo]

    with ``ho = hi + pad - kh`` and ``wo = wi + pad - kw`` over all
    ``kh, kw`` in ``[0, ks)``, skipping taps where ``(ho, wo)`` lies outside
    the ``h x w`` extent, and stores the result at the same flat offset in
    ``gi_ptr``. The pointer names suggest grad-output in / grad-input out.

    The indexing treats ``go`` and ``gi`` as contiguous ``(n, ic, h, w)``
    buffers and ``w`` as a contiguous ``(n, wc, ks * ks, h, w)`` buffer.
    Loaded values are cast to ``CT``, the running sum has dtype ``AT``, and
    the result is stored as ``CT``.
    """
    pid = tl.program_id(0)
    start = pid * BS
    offs = start + tl.arange(0, BS)
    ni, ci, hi, wi, m = _idx(offs, n, ic, h, w)
    val = tl.zeros((BS,), dtype=AT)
    for kh in range(ks):
        ho = hi + pad - kh
        hb = (ho >= 0) & (ho < h)
        for kw in range(ks):
            wo = wi + pad - kw
            b = hb & (wo >= 0) & (wo < w)
            go_off = ((ni * ic + ci) * h + ho) * w + wo
            w_off = ((ni * wc + ci % wc) * ks * ks + (kh * ks + kw)) * h * w + ho * w + wo
            go_val = tl.load(go_ptr + go_off, mask=m & b, other=0.0).to(CT)
            # w_off is built from ho/wo, which may be outside the image, so
            # the load must be masked by the bounds check as well; masking by
            # the lane mask alone can read outside the intended weight plane
            # (or outside the buffer). The discarded lanes were already zeroed
            # by tl.where below, so the result is unchanged.
            w_val = tl.load(w_ptr + w_off, mask=m & b, other=0.0).to(CT)
            val += tl.where(b & m, go_val * w_val, 0.0).to(AT)
    tl.store(gi_ptr + offs, val.to(CT), mask=m)
@triton.jit
def ska_bwd_w(
go_ptr, x_ptr, gw_ptr,
n, wc, h, w, ic, ks, pad,
BS: tl.constexpr,
CT: tl.constexpr, AT: tl.constexpr
):
pid = tl.program_id(0)
start = pid * BS
offs = start + tl.arange(0, BS)
ni, ci, hi, wi, m = _idx(offs, n, wc, h, w)
for kh in range(ks):
hin = hi - pad + kh
hb = (hin >= 0) & (hin < h)
for kw in range(ks):

最低0.47元/天 解锁文章
956

被折叠的 条评论
为什么被折叠?



