Data processing (in the LoadImagesAndLabels class)
Mosaic augmentation
Whether mosaic augmentation is applied is decided by a probability, and that probability is set in the hyperparameters.
Code: mosaic = self.mosaic and random.random() < hyp['mosaic']
About mosaic: https://blog.youkuaiyun.com/weixin_34910922/article/details/121041318
MixUp augmentation
MixUp is only applied when mosaic augmentation has been used.
if random.random() < hyp['mixup']:
    r = np.random.beta(32.0, 32.0)  # sample a mixing ratio from Beta(32, 32), concentrated around 0.5
    im = (im * r + im2 * (1 - r)).astype(np.uint8)  # blend the two images, using r as the weight
    labels = np.concatenate((labels, labels2), 0)  # concatenate the labels of both images
Data augmentation
Basically the standard, off-the-shelf (PyTorch-style) augmentation methods.
Reference blog: https://blog.youkuaiyun.com/weixin_45250844/article/details/120469766
I like to call these methods "Meitu XiuXiu" (photo-editing) tricks.
HSV color jitter
This involves 3 hyperparameters: hsv_h, hsv_s, hsv_v (listed in the hyperparameter section below); a sketch of how they are applied follows.
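A minimal sketch of how these three gains can be applied, modeled on YOLOv5's augment_hsv (utils/augmentations.py); treat the details as an approximation rather than an exact copy of the real function:

import cv2
import numpy as np

def augment_hsv(im, hgain=0.015, sgain=0.7, vgain=0.4):
    # random gains around 1.0, scaled by the hsv_h / hsv_s / hsv_v hyperparameters
    r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1
    hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV))
    x = np.arange(0, 256, dtype=r.dtype)
    lut_hue = ((x * r[0]) % 180).astype(im.dtype)   # OpenCV hue wraps at 180
    lut_sat = np.clip(x * r[1], 0, 255).astype(im.dtype)
    lut_val = np.clip(x * r[2], 0, 255).astype(im.dtype)
    im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
    return cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR)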
Coordinate conversion
The original label format, before conversion to YOLO format, uses corner coordinates:
From left to right: class, x1, y1, x2, y2
The conversion from corner coordinates to center + width/height is:
x = (x1 + x2) / 2
y = (y1 + y2) / 2
w = x2 - x1
h = y2 - y1
Normalization:
(x, w) are divided by the image width and (y, h) by the image height. By default the image width and height are the same (the result of the resize).
The official YOLO docs recommend image sizes that scale in multiples, e.g. 640 or 1280; a small sketch of the whole conversion follows.
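A minimal sketch of the conversion (the function name is mine; the repo has equivalent helpers such as xyxy2xywhn in utils/general.py):

import numpy as np

def xyxy_to_yolo(box, img_w, img_h):
    # box: [x1, y1, x2, y2] in pixels -> [x, y, w, h] normalized to 0..1
    x1, y1, x2, y2 = box
    x = (x1 + x2) / 2 / img_w
    y = (y1 + y2) / 2 / img_h
    w = (x2 - x1) / img_w
    h = (y2 - y1) / img_h
    return np.array([x, y, w, h])

print(xyxy_to_yolo([64, 128, 192, 320], 640, 640))  # [0.2 0.35 0.2 0.3]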
Hyperparameters
hsv_h: 0.015  # image HSV-Hue augmentation (fraction): hue
hsv_s: 0.7  # image HSV-Saturation augmentation (fraction): saturation
hsv_v: 0.4  # image HSV-Value augmentation (fraction): brightness
flipud: 0.0  # image flip up-down (probability)
fliplr: 0.5  # image flip left-right (probability)
mosaic: 1.0  # image mosaic (probability), applied when random.random() < hyp['mosaic']
mixup: 0.0  # image mixup (probability), blends two images: im * r + im2 * (1 - r)
Output
img: the image (normalized: img / 255)
targets: shape (n, 6), each row is (image index in the batch, class, converted coordinates x, y, w, h), 6 values in total; a toy example follows.
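A toy example of the targets tensor (values assumed), matching the (image, class, x, y, w, h) layout noted in the build_targets comment further below:

import torch

targets = torch.tensor([
    [0, 17, 0.50, 0.40, 0.20, 0.30],  # image 0 in the batch, class 17
    [0,  0, 0.25, 0.60, 0.10, 0.15],  # image 0, class 0
    [1,  3, 0.70, 0.70, 0.40, 0.50],  # image 1, class 3
])
print(targets.shape)  # torch.Size([3, 6])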
Model (yolo.py, common.py)
Model diagram: https://blog.youkuaiyun.com/qq_38253797/article/details/119754854
nc: 80 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
anchors:
- [10,13, 16,30, 33,23] # P3/8 w, h
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args] (args for Conv: [out_channels, kernel_size, stride, padding])
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
]
# YOLOv5 v6.0 head
head:
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]
Model construction
nc: the number of predicted classes. The output channels per scale = na * (nc + 5), where na = len(anchors[0]) // 2 (3 by default).
The + 5 stands for (center coordinates x, y; width and height; objectness confidence).
depth_multiple and width_multiple control the depth (number of repeats) and the width (number of channels) of the blocks.
The depth gain only applies to these blocks: if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}; a quick worked example follows before the code.
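A quick worked example of how nc, depth_multiple and width_multiple play out, using the yolov5s values from the yaml above (make_divisible mirrors the helper used in parse_model):

import math

def make_divisible(x, divisor=8):
    # round channels up to the nearest multiple of `divisor`, as parse_model does
    return math.ceil(x / divisor) * divisor

nc, na = 80, 3
no = na * (nc + 5)                 # 3 * 85 = 255 output channels per detection scale
gd, gw = 0.33, 0.50                # depth_multiple, width_multiple for yolov5s
n = max(round(9 * gd), 1)          # a C3 listed with number=9 in the yaml -> 3 actual repeats
c2 = make_divisible(1024 * gw, 8)  # 1024 channels in the yaml -> 512 actual channels
print(no, n, c2)                   # 255 3 512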
class BottleneckCSP(nn.Module):
# CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, 1, 1)
self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
self.cv4 = Conv(2 * c_, c2, 1, 1)
self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
self.act = nn.SiLU()
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
def forward(self, x):
y1 = self.cv3(self.m(self.cv1(x)))
y2 = self.cv2(x)
return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
def parse_model(d, ch): # model_dict, input_channels(3)
# Parse a YOLOv5 model.yaml dictionary
LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
if act:
Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = nn.SiLU()
LOGGER.info(f"{colorstr('activation:')} {act}") # print
na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors
no = na * (nc + 5) # number of outputs = anchors * (classes + 5)
layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
m = eval(m) if isinstance(m, str) else m # eval strings
for j, a in enumerate(args):
with contextlib.suppress(NameError):
args[j] = eval(a) if isinstance(a, str) else a # eval strings
n = n_ = max(round(n * gd), 1) if n > 1 else n # depth gain
if m in {
Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, c2, *args[1:]]
if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
args.insert(2, n) # number of repeats
n = 1
elif m is nn.BatchNorm2d:
args = [ch[f]]
elif m is Concat:
c2 = sum(ch[x] for x in f)
# TODO: channel, gw, gd
elif m in {Detect, Segment}:
args.append([ch[x] for x in f])
if isinstance(args[1], int): # number of anchors
args[1] = [list(range(args[1] * 2))] * len(f)
if m is Segment:
args[3] = make_divisible(args[3] * gw, 8)
elif m is Contract:
c2 = ch[f] * args[0] ** 2
elif m is Expand:
c2 = ch[f] // args[0] ** 2
else:
c2 = ch[f]
m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
t = str(m)[8:-2].replace('__main__.', '') # module type
np = sum(x.numel() for x in m_.parameters()) # number params
m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params
LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f} {t:<40}{str(args):<30}') # print
save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
layers.append(m_)
if i == 0:
ch = []
ch.append(c2)
return nn.Sequential(*layers), sorted(save)
backbone: mainly built from Conv blocks (convolution, batch normalization, SiLU activation).
Downsampling is not done with pooling layers here; 3x3 convolutions with stride 2 take their place, and there is no fully connected layer. Pooling does still show up once, though: three max-pool operations chained in series (inside the SPPF block). Each version differs slightly, but the overall structure is the same.
In total the input is downsampled by 32x (this relates to the receptive field and to the later upsampling). A simplified sketch of the Conv block is below.
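A simplified sketch of that Conv block (my own minimal version; the real Conv class in common.py additionally handles automatic padding, groups and a configurable activation):

import torch
import torch.nn as nn

class ConvBNSiLU(nn.Module):
    # conv -> BatchNorm -> SiLU, the basic building block of the backbone
    def __init__(self, c1, c2, k=1, s=1):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, padding=k // 2, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

x = torch.randn(1, 3, 640, 640)
print(ConvBNSiLU(3, 32, k=3, s=2)(x).shape)  # torch.Size([1, 32, 320, 320]), stride 2 halves H and W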
forward
def _forward_once(self, x, profile=False, visualize=False):
y, dt = [], [] # outputs
for m in self.model:
if m.f != -1: # if not from previous layer
x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
if profile:
self._profile_one_layer(m, x, dt)
x = m(x) # run
y.append(x if m.i in self.save else None) # save output
if visualize:
feature_visualization(x, m.type, m.i, save_dir=visualize)
return x
Explanation:
[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
When `from` is -1 in the config, the layer simply takes the output of the module right before it. I recommend stepping through with a debugger: you can see that the modules are stored in a list in the same order as the config file.
[[-1, 4], 1, Concat, [1]], # cat backbone P3
When `from` is [-1, 4] in the config, the output of block 4 (a C3 block) is fetched and the current output is stacked onto it.
A neural network can be seen as feature extraction, and this is done so that the features already extracted are not wasted. As for why earlier outputs are brought back in: one reason is that the receptive field grows larger the deeper you go, the other is speed. The network finally outputs 3 groups of feature maps, used to detect small, medium, and large objects (P3/8, P4/16, P5/32); a toy illustration of the Concat is below.
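A toy illustration of that Concat (shapes assumed, roughly matching yolov5s at a 640x640 input):

import torch

cur = torch.randn(1, 128, 80, 80)  # output of the upsample layer (from = -1)
p3  = torch.randn(1, 128, 80, 80)  # saved output of backbone block 4 (C3)
out = torch.cat((cur, p3), dim=1)  # Concat with args [1] concatenates on the channel dim
print(out.shape)                   # torch.Size([1, 256, 80, 80])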
In the head section of the config:
You can see the head is built from Conv blocks, upsample layers, Concat layers, and C3 blocks. Refer to the model diagram.
Detect output format
This is the final output layer.
After the preceding layers have run, each scale's output has shape (batch size, (num classes + 5) * num anchors (3 by default, one per anchor size), grid height, grid width).
After Detect reshapes it, the shape is (batch size, num anchors (3 by default), grid height, grid width, num classes + 5 (x, y, w, h, objectness)).
For example: [(1, 3, 32, 32, 85), (1, 3, 16, 16, 85), (1, 3, 8, 8, 85)]
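A toy check of that reshape, using the first example shape above:

import torch

bs, na, nc, ny, nx = 1, 3, 80, 32, 32
no = nc + 5
x = torch.randn(bs, na * no, ny, nx)                   # raw conv output: (1, 255, 32, 32)
x = x.view(bs, na, no, ny, nx).permute(0, 1, 3, 4, 2)  # -> (1, 3, 32, 32, 85), as in Detect.forward below
print(x.shape)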
def forward(self, x):
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
if not self.training: # inference
if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
if isinstance(self, Segment): # (boxes + masks)
xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i] # xy
wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i] # wh
y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
else: # Detect (boxes only)
xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy
wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh
y = torch.cat((xy, wh, conf), 4)
z.append(y.view(bs, self.na * nx * ny, self.no))
return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
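To make the xy / wh decode concrete, a toy decode of a single prediction (all numbers assumed; note that _make_grid builds the grid with a -0.5 offset, which is what lets the sigmoid * 2 term reach into neighbouring cells):

import torch

stride = 8.0                                      # P3/8 detection layer
grid_xy = torch.tensor([10.0 - 0.5, 7.0 - 0.5])   # cell (10, 7) including the -0.5 grid offset
anchor_wh = torch.tensor([10.0, 13.0])            # first P3 anchor, in input-image pixels
raw = torch.tensor([0.2, -0.1, 0.3, 0.4])         # raw tx, ty, tw, th from the 1x1 conv
xy = (raw[:2].sigmoid() * 2 + grid_xy) * stride   # box center in input-image pixels
wh = (raw[2:].sigmoid() * 2) ** 2 * anchor_wh     # box width/height in input-image pixels
print(xy, wh)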
Anchor boxes
build_targets(p, targets): p is the model's output, targets come from the dataloader.
def build_targets(self, p, targets):
# Build targets for compute_loss(), input targets(image,class,x,y,w,h)
na, nt = self.na, targets.shape[0] # number of anchors, targets
tcls, tbox, indices, anch = [], [], [], []
gain = torch.ones(7, device=self.device) # normalized to gridspace gain
ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt) # same as .repeat_interleave(nt)
targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None]), 2) # append anchor indices
g = 0.5 # bias
off = torch.tensor(
[
[0, 0],
[1, 0],
[0, 1],
[-1, 0],
[0, -1], # j,k,l,m
# [1, 1], [1, -1], [-1, 1], [-1, -1], # jk,jm,lk,lm
],
device=self.device).float() * g # offsets
for i in range(self.nl):
anchors, shape = self.anchors[i], p[i].shape
gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]] # xyxy gain
# Match targets to anchors
t = targets * gain # shape(3,n,7)
if nt:
# Matches
r = t[..., 4:6] / anchors[:, None] # wh ratio
j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t'] # compare
# j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
t = t[j] # filter
# Offsets
gxy = t[:, 2:4] # grid xy
gxi = gain[[2, 3]] - gxy # inverse
j, k = ((gxy % 1 < g) & (gxy > 1)).T
l, m = ((gxi % 1 < g) & (gxi > 1)).T
j = torch.stack((torch.ones_like(j), j, k, l, m))
t = t.repeat((5, 1, 1))[j]
offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
else:
t = targets[0]
offsets = 0
# Define
bc, gxy, gwh, a = t.chunk(4, 1) # (image, class), grid xy, grid wh, anchors
a, (b, c) = a.long().view(-1), bc.long().T # anchors, image, class
gij = (gxy - offsets).long()
gi, gj = gij.T # grid indices
# Append
indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1))) # image, anchor, grid
tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
anch.append(anchors[a]) # anchors
tcls.append(c) # class
return tcls, tbox, indices, anch
1. First, map the ground-truth coordinates onto the grid:
gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]] # xyxy gain
t = targets * gain
Explanation: targets are multiplied by the sizes of dimensions 3 and 2 of that scale's prediction tensor; those two dimensions are exactly the grid size (nx, ny).
The dataloader normalized the coordinates earlier, which is why this is a multiplication rather than a division.
Here is a diagram I drew myself, plus a small numeric example:
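A toy version of this mapping (numbers assumed): one target on an 80x80 grid, after the anchor index has been appended as the 7th column:

import torch

t = torch.tensor([[0, 17, 0.50, 0.40, 0.20, 0.30, 0]])  # [image, class, x, y, w, h, anchor index]
gain = torch.ones(7)
gain[2:6] = torch.tensor([80.0, 80.0, 80.0, 80.0])  # (nx, ny, nx, ny) taken from the prediction shape
print(t * gain)                                     # x, y, w, h become 40, 32, 16, 24 in grid units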
2. Compute the width/height ratios against the anchors and filter out targets that do not match any anchor well:
r = t[..., 4:6] / anchors[:, None] # wh ratio
j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t']
Explanation: earlier versions computed an IoU between anchors and target boxes; later versions switched to the width/height ratio. Only the larger of r and 1/r is taken, because each ratio is either below 1 or above 1: too small is bad and too large is bad, so for convenience the maximum is compared against a threshold. That threshold is the anchor_t hyperparameter; a worked example is below.
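A worked example of the ratio test (numbers assumed; note build_targets works in grid units, the anchors having already been divided by the stride inside Detect):

import torch

anchor = torch.tensor([1.25, 1.625])   # the (10, 13) P3 anchor divided by stride 8
gwh = torch.tensor([[2.0, 3.0]])       # one target's (w, h) after multiplying by gain
r = gwh / anchor                        # tensor([[1.60, 1.85]])
worst = torch.max(r, 1 / r).max(1)[0]   # tensor([1.8462])
print(worst < 4.0)                      # kept: anchor_t defaults to 4.0 in the hyp files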
3. Offsets
gxy = t[:, 2:4] # grid xy
gxi = gain[[2, 3]] - gxy # inverse
j, k = ((gxy % 1 < g) & (gxy > 1)).T
l, m = ((gxi % 1 < g) & (gxi > 1)).T
j = torch.stack((torch.ones_like(j), j, k, l, m))
t = t.repeat((5, 1, 1))[j]
Explanation:
Look at the fractional part of the ground-truth center. If the x fraction is < 0.5, subtracting 0.5 (then flooring) also assigns the cell to the left; likewise if it is > 0.5, adding 0.5 assigns the cell to the right, and the same logic applies to y (that is what the gxi inverse-coordinate check handles). There are five offset candidates in total (the cell itself plus left, up, right, down), of which at most 3 are selected per target: the cell itself plus one horizontal and one vertical neighbour. So in the worst case one ground-truth box yields 3 cells x 3 anchors x 3 detection layers of candidate positives; a small check of the mask is below.
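A small check of the offset mask for a single target center (numbers assumed): x sits in the left half of its cell and y in the bottom half, so besides its own cell the left neighbour and the lower neighbour also become positives:

import torch

g = 0.5
gxy = torch.tensor([[40.3, 40.7]])            # one target center in grid units
gxi = torch.tensor([80.0, 80.0]) - gxy        # inverse coordinates on an 80x80 grid
j, k = ((gxy % 1 < g) & (gxy > 1)).T          # close to the left / top edge of its cell
l, m = ((gxi % 1 < g) & (gxi > 1)).T          # close to the right / bottom edge
mask = torch.stack((torch.ones_like(j), j, k, l, m))
print(mask.squeeze())                          # tensor([True, True, False, False, True])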
4. Outputs:
1. indices (image index, anchor index, grid y index, grid x index)
2. box (the center coordinates here are relative to the assigned grid cell, whereas before they were relative to the whole image)
tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
3. class
4. anchors (the matched anchor for each target)
IV: Loss function
for i, pi in enumerate(p): # layer index, layer predictions
b, a, gj, gi = indices[i] # image, anchor, gridy, gridx
tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device) # target obj
n = b.shape[0] # number of targets
if n:
# pxy, pwh, _, pcls = pi[b, a, gj, gi].tensor_split((2, 4, 5), dim=1) # faster, requires torch 1.8.0
pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1) # target-subset of predictions
# Regression
pxy = pxy.sigmoid() * 2 - 0.5
pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]
pbox = torch.cat((pxy, pwh), 1) # predicted box
iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze() # iou(prediction, target)
lbox += (1.0 - iou).mean() # iou loss
# Objectness
iou = iou.detach().clamp(0).type(tobj.dtype)
if self.sort_obj_iou:
j = iou.argsort()
b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
if self.gr < 1:
iou = (1.0 - self.gr) + self.gr * iou
tobj[b, a, gj, gi] = iou # iou ratio
# Classification
if self.nc > 1: # cls loss (only if multiple classes)
t = torch.full_like(pcls, self.cn, device=self.device) # targets
t[range(n), tcls[i]] = self.cp
lcls += self.BCEcls(pcls, t) # BCE
# Append targets to text file
# with open('targets.txt', 'a') as file:
# [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]
obji = self.BCEobj(pi[..., 4], tobj)
lobj += obji * self.balance[i] # obj loss
if self.autobalance:
self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()
Explanation: only the predictions at the positions selected by build_targets (the positive candidates) are fed into the box and class losses, not all predictions; the objectness loss, by contrast, runs over the whole grid (pi[..., 4] against tobj).
Here is another diagram I drew:
For example, a model output of shape (1, 3, 80, 80, 85): 80, 80 is the grid height and width, and the coordinate mapping mentioned earlier multiplies by exactly these values. Each grid cell holds 85 values per anchor, so each grid cell (per anchor) predicts one object.
The filtering (indexing the positive positions) is this line:
pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1)
1. box loss: lbox += (1.0 - iou).mean()  # CIoU
2. classification loss: lcls += self.BCEcls(pcls, t)  # BCE
3. objectness (confidence) loss: obji = self.BCEobj(pi[..., 4], tobj)
Finally the three losses are added together, each scaled by its gain:
return (lbox + lobj + lcls) * bs, torch.cat((lbox, lobj, lcls)).detach()
Hyperparameters:
box: 0.05 # box loss gain
cls: 0.5 # cls loss gain
cls_pw: 1.0 # cls BCELoss positive_weight (handles class positive/negative imbalance)
obj: 1.0 # obj loss gain (scale with pixels)
obj_pw: 1.0 # obj BCELoss positive_weight (handles objectness sample imbalance)
iou_t: 0.20 # IoU training threshold
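Roughly how these hyperparameters enter ComputeLoss (a simplified sketch, not a verbatim copy of loss.py; the loss values here are just placeholders):

import torch
import torch.nn as nn

h = {'box': 0.05, 'cls': 0.5, 'cls_pw': 1.0, 'obj': 1.0, 'obj_pw': 1.0}

# the positive weights go into the BCE criteria used above as self.BCEcls / self.BCEobj
BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']]))
BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']]))

# lbox, lobj, lcls are accumulated over the 3 detection layers as shown above
lbox, lobj, lcls, bs = torch.rand(1), torch.rand(1), torch.rand(1), 16  # dummy values
lbox *= h['box']  # box loss gain
lobj *= h['obj']  # obj loss gain
lcls *= h['cls']  # cls loss gain
loss = (lbox + lobj + lcls) * bs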
Summary:
Data flow: dataloader -> forward -> build_targets -> loss
Takeaway: the key is in how the data is combined and processed. However we combine it, as long as we design a good loss function, the network will learn the result we want.