YOLOv8 ODConvNeXt
[Experts, feel free to skip this; please go easy on me.]
The network is divided into backbone, neck, and head:
backbone: the trunk network, usually meaning the feature-extraction network; its job is to extract information from the image for the rest of the network to use.
neck: connects the backbone and the head; it fuses the features extracted by the backbone to improve robustness.
head: produces the network output; it uses the extracted features to make the predictions.
yolov5s network structure: comparison before and after the modification.
The official code defines the network structure in a .yaml file and, using PyTorch's dynamic-graph nature, parses that .yaml to build the network automatically. The .yaml file contains depth_multiple and width_multiple, the parameters that control the network's depth and width. The benefit of this approach is that architectures can be configured very flexibly; the downside is that it makes the network structure harder to read.
yolov5s vs. yolov5s-odconvnext .yaml file comparison:
yolov5 vs. yolov8 .yaml file comparison:
YAML parameter reference
1. nc: the number of classes in the dataset.
2. depth_multiple: model depth factor (adjusts the network depth).
It controls how many times each layer is repeated: it is multiplied by the number column of the backbone & head entries and rounded, giving that layer's repeat count.
E.g. for [[-1, 1, Conv, [64, 3, 2]]], with depth_multiple = 1 the layer is repeated 1 × 1 = 1 time.
3. width_multiple: model channel factor (adjusts the network width).
It controls the number of output feature-map channels: it is multiplied by the entry's output channel count to give the layer's actual output channels.
E.g. for [[-1, 1, Conv, [128, 3, 2]]], with width_multiple = 0.5 the output has 128 × 0.5 = 64 channels. Both scalings are sketched right after this list.
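To make the two multipliers concrete, here is a minimal sketch of how yolov5/yolov8 apply them inside parse_model; make_divisible mirrors the helper in utils/general.py, the rest is paraphrased:

```python
import math

def make_divisible(x, divisor=8):
    # channel counts are rounded up to the nearest multiple of 8
    return math.ceil(x / divisor) * divisor

depth_multiple, width_multiple = 0.33, 0.50  # yolov5s values

n = 3                                         # the 'number' column of a yaml entry
n = max(round(n * depth_multiple), 1) if n > 1 else n
print(n)   # 3 * 0.33 -> 1 repeat

c2 = 128                                      # first element of 'args' (output channels)
c2 = make_divisible(c2 * width_multiple, 8)
print(c2)  # 128 * 0.5 -> 64 channels
```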
[from, number, module, args] (or, in YOLOv8, [from, repeats, module, args]) field reference
from: which layer the input comes from; -1 means the previous layer. In [[-1, 4], 1, Concat, [1]] # cat backbone P3, [-1, 4] means the input is taken from both the previous layer and layer 4.
number / repeats: how many copies of the same module to stack; 6 means six identical modules.
module: the module's name; these modules are defined in common.py.
args: the module's constructor arguments, parsed and passed in when the module is instantiated.
For example, [None, 2, 'nearest'] in [-1, 1, nn.Upsample, [None, 2, 'nearest']],
[128, 3, 2] in [-1, 1, Conv, [128, 3, 2]] # 1-P2/4,
or [1024, True] in [-1, 3, C2f, [1024, True]].
For Conv, the args are, in order: output channels, kernel size, stride, and padding. The sketch below walks one entry through these rules.
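A hand-walk of a single entry, reusing make_divisible from the sketch above; the variable names here are illustrative, not the builder's actual ones:

```python
row = [-1, 1, 'Conv', [128, 3, 2]]          # [from, number, module, args]
f, n, module, args = row
gd, gw = 0.33, 0.50                          # yolov5s depth/width multiples

n = max(round(n * gd), 1) if n > 1 else n    # number = 1 stays 1
c2, k, s = args                              # output channels, kernel size, stride
c2 = make_divisible(c2 * gw, 8)              # 128 * 0.5 -> 64

print(module, 'c2 =', c2, 'k =', k, 's =', s, 'repeats =', n)
# the model builder then calls Conv(c1, 64, 3, 2), where c1 is the
# output channel count of the 'from' layer; padding defaults to k // 2
```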
YOLOv8 network structure
Module: ConvModule
Function: the standard convolution block.
Parameters: input channels (c1), output channels (c2), kernel size (k, default 1), stride (s, default 1), padding (p, default None), groups (g, default 1), dilation (d, default 1), and activation (act, default True, which selects SiLU). A paraphrased sketch follows.
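Paraphrased from ultralytics/nn/modules, the block is essentially Conv2d → BatchNorm2d → SiLU, with autopad filling in 'same'-style padding when p is None:

```python
import torch.nn as nn

def autopad(k, p=None, d=1):
    # compute 'same'-style padding; mirrors the yolov5/yolov8 helper
    if d > 1:
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p

class Conv(nn.Module):
    # standard convolution block: Conv2d -> BatchNorm2d -> activation
    default_act = nn.SiLU()

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
```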
1. Configure common.py:
The convolutional units of the YOLOv8 network are defined in ultralytics/nn/modules.py.
Add the ODConv + ConvNeXt modules to ./models/common.py:
```python
#------------------------------------Convnext start -------------------------------------
# imports so the snippet is self-contained (common.py already has them)
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


# ConvNeXt block: depthwise conv -> LayerNorm -> 1x1 expand -> GELU -> 1x1 project, plus a residual
class ConvNextBlock(nn.Module):
    def __init__(self, inputdim, dim, drop_path=0., layer_scale_init_value=1e-6, kersize=7):  # demo: [64, 64, 1], where 1 denotes the number of repeats
        super().__init__()
        # an extra outdim output-channel argument could be added here to match the yolov5 config format
        # self.flag = True if dim == outdim else False
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=kersize, padding=kersize // 2, groups=dim)  # depthwise conv
        self.norm = LayerNorm_s(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise convs implemented as Linear over the channel dim
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim,)),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        # if self.flag == False:
        #     raise ValueError(
        #         f"Expected input out to have {dim} channels, but got {outdim} channels instead")
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        x = input + self.drop_path(x)
        return x
# LayerNorm supporting both channels_last (N, H, W, C) and channels_first (N, C, H, W) inputs
class LayerNorm_s(nn.Module):
    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path_f(x, self.drop_prob, self.training)


def drop_path_f(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output
#------------------------------------Convnext end -------------------------------------
#------------------------------------ODConv start -------------------------------------
# Omni-Dimensional Dynamic Convolution: K parallel kernels weighted by four attentions
# (per-filter, per-location, per-channel, per-kernel) computed from the input context
class ODConv2d_3rd(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, dilation=1, groups=1, bias=True,
                 K=4, r=1 / 16, save_parameters=False,
                 padding_mode='zeros', device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        self.K = K
        self.r = r
        self.save_parameters = save_parameters

        super().__init__(in_channels, out_channels, kernel_size, stride,
                         padding, dilation, groups, bias, padding_mode)

        del self.weight
        self.weight = nn.Parameter(torch.empty((
            K,
            out_channels,
            in_channels // groups,
            *self.kernel_size,
        ), **factory_kwargs))

        if bias:
            del self.bias
            self.bias = nn.Parameter(torch.empty(K, out_channels, **factory_kwargs))

        hidden_dim = max(int(in_channels * r), 16)  # floor the attention hidden dim at 16
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.reduction = nn.Linear(in_channels, hidden_dim)
        self.fc = nn.Conv2d(in_channels, hidden_dim, 1, bias=False)
        self.bn = nn.BatchNorm2d(hidden_dim)
        self.act = nn.ReLU(inplace=True)
        # self.act = nn.SiLU(inplace=True)

        self.fc_f = nn.Linear(hidden_dim, out_channels)
        if not save_parameters or self.kernel_size[0] * self.kernel_size[1] > 1:
            self.fc_s = nn.Linear(hidden_dim, self.kernel_size[0] * self.kernel_size[1])
        if not save_parameters or in_channels // groups > 1:
            self.fc_c = nn.Linear(hidden_dim, in_channels // groups)
        if not save_parameters or K > 1:
            self.fc_w = nn.Linear(hidden_dim, K)

        self.reset_parameters()

    def reset_parameters(self) -> None:
        fan_out = self.kernel_size[0] * self.kernel_size[1] * self.out_channels // self.groups
        for i in range(self.K):
            self.weight.data[i].normal_(0, math.sqrt(2.0 / fan_out))
        if self.bias is not None:
            self.bias.data.zero_()

    def extra_repr(self):
        return super().extra_repr() + f', K={self.K}, r={self.r:.4}'

    def get_weight_bias(self, context):
        B, C, H, W = context.shape

        if C != self.in_channels:
            raise ValueError(
                f"Expected context{[B, C, H, W]} to have {self.in_channels} channels, but got {C} channels instead")

        # x = self.gap(context).squeeze(-1).squeeze(-1)  # B, c_in
        # x = self.reduction(x)  # B, hidden_dim
        x = self.gap(context)
        x = self.fc(x)
        if x.size(0) > 1:  # BatchNorm needs more than one sample in training
            x = self.bn(x)
        x = x.squeeze(-1).squeeze(-1)
        x = self.act(x)

        attn_f = self.fc_f(x).sigmoid()  # B, c_out
        attn = attn_f.view(B, 1, -1, 1, 1, 1)  # B, 1, c_out, 1, 1, 1
        if hasattr(self, 'fc_s'):
            attn_s = self.fc_s(x).sigmoid()  # B, k * k
            attn = attn * attn_s.view(B, 1, 1, 1, *self.kernel_size)  # B, 1, c_out, 1, k, k
        if hasattr(self, 'fc_c'):
            attn_c = self.fc_c(x).sigmoid()  # B, c_in // groups
            attn = attn * attn_c.view(B, 1, 1, -1, 1, 1)  # B, 1, c_out, c_in // groups, k, k
        if hasattr(self, 'fc_w'):
            attn_w = self.fc_w(x).softmax(-1)  # B, n
            attn = attn * attn_w.view(B, -1, 1, 1, 1, 1)  # B, n, c_out, c_in // groups, k, k

        weight = (attn * self.weight).sum(1)  # B, c_out, c_in // groups, k, k
        weight = weight.view(-1, self.in_channels // self.groups, *self.kernel_size)  # B * c_out, c_in // groups, k, k

        bias = None
        if self.bias is not None:
            if hasattr(self, 'fc_w'):
                bias = attn_w @ self.bias
            else:
                bias = self.bias.tile(B, 1)
            bias = bias.view(-1)  # B * c_out

        return weight, bias

    def forward(self, input, context=None):
        B, C, H, W = input.shape

        if C != self.in_channels:
            raise ValueError(
                f"Expected input{[B, C, H, W]} to have {self.in_channels} channels, but got {C} channels instead")

        # note: `context if context is not None else input`, not `context or input`:
        # truth-testing a multi-element tensor raises a RuntimeError
        weight, bias = self.get_weight_bias(context if context is not None else input)

        # batched dynamic conv via the grouped-convolution trick
        output = nn.functional.conv2d(
            input.view(1, B * C, H, W), weight, bias,
            self.stride, self.padding, self.dilation, B * self.groups)  # 1, B * c_out, h_out, w_out
        output = output.view(B, self.out_channels, *output.shape[2:])

        return output

    def debug(self, input, context=None):
        B, C, H, W = input.shape

        if C != self.in_channels:
            raise ValueError(
                f"Expected input{[B, C, H, W]} to have {self.in_channels} channels, but got {C} channels instead")

        output_size = [
            ((H, W)[i] + 2 * self.padding[i] - self.dilation[i] * (self.kernel_size[i] - 1) - 1) // self.stride[i] + 1
            for i in range(2)
        ]

        weight, bias = self.get_weight_bias(context if context is not None else input)

        weight = weight.view(B, self.groups, self.out_channels // self.groups, -1)  # B, groups, c_out // groups, c_in // groups * k * k
        unfold = nn.functional.unfold(
            input, self.kernel_size, self.dilation, self.padding, self.stride)  # B, c_in * k * k, H_out * W_out
        unfold = unfold.view(B, self.groups, -1, output_size[0] * output_size[1])  # B, groups, c_in // groups * k * k, H_out * W_out

        output = weight @ unfold  # B, groups, c_out // groups, H_out * W_out
        output = output.view(B, self.out_channels, *output_size)  # B, c_out, H_out, W_out

        if bias is not None:
            output = output + bias.view(B, self.out_channels, 1, 1)

        return output


class ODConv_3rd(nn.Module):
    # Standard convolution wrapper: ODConv2d_3rd -> BatchNorm2d -> SiLU
    # (autopad is the usual yolov5/yolov8 helper already defined in common.py)
    def __init__(self, c1, c2, k=1, s=1, kerNums=1, g=1, p=None, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        self.conv = ODConv2d_3rd(c1, c2, k, s, autopad(k, p), groups=g, K=kerNums)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        return self.act(self.conv(x))
#------------------------------------ODConv end -------------------------------------
```
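A quick shape check for the two new modules; this assumes the block above is on the import path together with common.py's autopad:

```python
import torch

x = torch.randn(2, 64, 32, 32)

print(ConvNextBlock(64, 64)(x).shape)                     # torch.Size([2, 64, 32, 32])
print(ODConv_3rd(64, 128, k=3, s=2, kerNums=4)(x).shape)  # torch.Size([2, 128, 16, 16])
```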
2. Configure yolo.py (tasks.py in YOLOv8):
Next, find the parse_model function in ultralytics/nn/tasks.py and register the class names there.
Inside parse_model's loop for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):, all that is needed is to add the ODConv + ConvNeXt modules to the matching module list:

```python
if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
         BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, ODConv_3rd, ConvNextBlock]:
```
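For orientation, here is a miniature sketch of what that branch of parse_model does with the registered classes; build_layer and its defaults are illustrative, not the real function:

```python
import math
import torch.nn as nn

def make_divisible(x, divisor=8):
    return math.ceil(x / divisor) * divisor

def build_layer(row, ch, gd=0.33, gw=0.50):
    # row is one yaml entry, ch the list of channel counts produced so far
    f, n, m, args = row
    n = max(round(n * gd), 1) if n > 1 else n        # depth scaling
    c1, c2 = ch[f], make_divisible(args[0] * gw, 8)  # channel wiring + width scaling
    args = [c1, c2, *args[1:]]                       # prepend channels for m(*args)
    layer = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)
    ch.append(c2)
    return layer

# e.g. ch = [64]; build_layer([-1, 1, ODConv_3rd, [128, 3, 2]], ch)
```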
3. Add a new YOLOv8 yaml config under ultralytics/models/v8:
Start by adding a yolov8_ODConv.yaml file along the lines of the hypothetical fragment below.
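The fragment mirrors the layout of ultralytics/models/v8/yolov8.yaml with some Conv/C2f entries swapped for the new modules; the exact placement and args are choices to experiment with, not the canonical file:

```yaml
nc: 80  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.25  # layer channel multiple

backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]]          # 0-P1/2
  - [-1, 1, ODConv_3rd, [128, 3, 2]]   # 1-P2/4, standard Conv swapped for ODConv
  - [-1, 3, ConvNextBlock, [128]]      # ConvNeXt block in place of C2f
  # ... remaining backbone and head entries unchanged from yolov8.yaml
```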
Method 2, for reference: "Improving the YOLO series: improving YOLOv8 — how to add 20+ attention mechanisms to YOLOv8 and experiment with different placements."
References:
https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md
https://github.com/chengshuxiao/YOLOv5-ODConvNeXt
https://github.com/OSVAI/ODConv
https://github.com/ultralytics/assets/releases
https://github.com/z1069614715/objectdetection_script/tree/master/cv-attention