1. ShuffleNet V1
1.1 Abstract
We present ShuffleNet, an extremely computation-efficient CNN architecture designed for mobile devices with very limited computing power. The architecture uses two new operations, pointwise group convolution and channel shuffle, to greatly reduce computation cost while maintaining accuracy.
1.2 Approach
1.2.1 Channel Shuffle for Group Convolutions
In tiny networks, 1×1 convolutions are computationally expensive, so under a limited budget the number of feature-map channels is constrained, which significantly hurts accuracy. A simple remedy is to sparsify the connections between channels by applying group convolution to the 1×1 convolutions as well.
As shown in figure (a) below, each output feature map is then connected to only a subset of the input feature maps. This has a side effect: after stacking several such layers, the outputs of a group derive only from the inputs of that same group. For example, the red output features come only from the red input features, and the blue outputs only from the blue inputs. This blocks information flow between channel groups and weakens the network's representational power.
If we allow a group convolution to take inputs from different groups, the input and output channels become fully related. As shown in figure (b) above, the features of each group are first divided into several subgroups, and each subgroup is then fed to a different group of the next layer. Suppose a convolutional layer has $g$ groups and its output has $g \times n$ channels: we first reshape the output channel dimension into $(g, n)$, then transpose it to $(n, g)$, and finally flatten it back as the input of the next layer. The figure below illustrates this with $g = 3$, $n = 2$.
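As a quick sanity check, here is a minimal standalone sketch of this reshape-transpose-flatten trick (a hypothetical toy snippet, separate from the full implementation below):

import torch

g, n = 3, 2                      # 3 groups, 2 channels per group
x = torch.arange(g * n)          # channel ids [0..5]; group 0 = {0, 1}, group 1 = {2, 3}, ...
shuffled = x.view(g, n).t().reshape(-1)
print(shuffled)                  # tensor([0, 2, 4, 1, 3, 5]): each new group mixes channels from different old groups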
1.2.2 ShuffleNet Unit
Next, let's analyze how the FLOPs change. Assume the input size is $c \times h \times w$, the number of bottleneck channels is $m$, and $g$ is the number of groups. The building blocks of ResNet (left) and ResNeXt (right) are shown below:
FLOPs of the ResNet unit:
$(c \times 1 \times 1)hwm + (m \times 3 \times 3)hwm + (m \times 1 \times 1)hwc = 9hwm^2 + 2hwcm = hw(2cm + 9m^2)$
FLOPs of the ResNeXt unit:
$(c/g \times 1 \times 1)hwm + (m/g \times 3 \times 3)hwm + (m/g \times 1 \times 1)hwc = hw(2cm/g + 9m^2/g)$
(The paper gives $hw(2cm + 9m^2/g)$, presumably because it treats the 1×1 convolutions in the ResNeXt bottleneck as regular convolutions rather than group convolutions.)
FLOPs of the ShuffleNet unit:
$(c/g \times 1 \times 1)hwm + (m/m \times 3 \times 3)hwm + (m/g \times 1 \times 1)hwc = hw(2cm/g + 9m)$
Here the 3×3 convolution is depthwise ($m/m$ input channels per filter), which is why its term drops from $9m^2$ to $9m$. Under the same settings, the ShuffleNet unit clearly has the smallest FLOPs.
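Plugging concrete (arbitrary, purely illustrative) numbers into the three expressions makes the gap visible:

h, w, c, m, g = 28, 28, 256, 64, 4
resnet     = h * w * (2 * c * m + 9 * m ** 2)        # hw(2cm + 9m^2)
resnext    = h * w * (2 * c * m + 9 * m ** 2 // g)   # hw(2cm + 9m^2/g), the paper's count
shufflenet = h * w * (2 * c * m // g + 9 * m)        # hw(2cm/g + 9m)
print(resnet, resnext, shufflenet)                   # 54591488 32915456 6874112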
1.3 Network Architecture
The ShuffleNet units in the network are organized into three stages. The first unit of each stage has stride 2, the number of feature-map channels doubles from one stage to the next, and the bottleneck layers use 1/4 of the output channels.
The number of groups $g$ controls the sparsity of the pointwise convolutions: under the same complexity budget, more groups allow a larger number of feature-map channels.
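To see why, take $c_1 = c_2 = c$ for a 1×1 group convolution, so its cost is $F = hwc^2/g$; at a fixed budget $F$ the admissible width is $c = \sqrt{Fg/hw}$, i.e. doubling the number of groups lets the width grow by a factor of $\sqrt{2}$. (A back-of-envelope note, not stated in this form in the paper.)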
1.4 PyTorch Implementation
import torch
import torch.nn as nn
class ConvBNReLU(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size, stride, groups):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            # bias=False: the conv is immediately followed by BN, so a bias is redundant
            nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True),
        )
class ConvBN(nn.Sequential):
    def __init__(self, in_channel, out_channel, groups):
        super(ConvBN, self).__init__(
            nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=1, stride=1, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
        )
class ChannelShuffle(nn.Module):
def __init__(self, groups):
super(ChannelShuffle, self).__init__()
self.groups = groups
def forward(self, x):
        # Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]
        batch_size, num_channels, height, width = x.size()
        channels_per_group = num_channels // self.groups
        x = x.view(batch_size, self.groups, channels_per_group, height, width)
        x = torch.transpose(x, dim0=1, dim1=2).contiguous()
        x = x.view(batch_size, -1, height, width)
return x
class ShuffleNetUnits(nn.Module):
def __init__(self, in_channel, out_channel, stride, groups):
super(ShuffleNetUnits, self).__init__()
        self.stride = stride
        # For stride-2 units the shortcut is concatenated, so the residual branch
        # only needs to produce (out_channel - in_channel) channels
        out_channel = out_channel - in_channel if self.stride > 1 else out_channel
        mid_channel = out_channel // 4  # bottleneck width = 1/4 of the output channels
        self.bottleneck = nn.Sequential(
            ConvBNReLU(in_channel=in_channel, out_channel=mid_channel, kernel_size=1, stride=1, groups=groups),
            ChannelShuffle(groups=groups),
            # 3x3 depthwise convolution (groups = channels), matching the 9m term in the FLOPs analysis
            ConvBNReLU(in_channel=mid_channel, out_channel=mid_channel, kernel_size=3, stride=stride, groups=mid_channel),
            ConvBN(in_channel=mid_channel, out_channel=out_channel, groups=groups),
        )
if self.stride > 1:
self.shortcut = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
self.relu = nn.ReLU6(inplace=True)
def forward(self, x):
out = self.bottleneck(x)
if self.stride > 1:
out = torch.cat([self.shortcut(x), out], dim=1)
else:
out += x
return self.relu(out)
class ShuffleNet(nn.Module):
def __init__(self, planes, layers, groups, num_classes=1000):
super(ShuffleNet, self).__init__()
self.stage1 = nn.Sequential(
            ConvBNReLU(in_channel=3, out_channel=24, kernel_size=3, stride=2, groups=1),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
self.stage2 = self._make_layer(in_channel=24, out_channel=planes[0], groups=groups, block_num=layers[0], is_stage2=True)
self.stage3 = self._make_layer(in_channel=planes[0], out_channel=planes[1], groups=groups, block_num=layers[1], is_stage2=False)
self.stage4 = self._make_layer(in_channel=planes[1], out_channel=planes[2], groups=groups, block_num=layers[2], is_stage2=False)
self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Sequential(
nn.Dropout(p=0.2),
nn.Linear(in_features=planes[2], out_features=num_classes)
)
# weight init
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, mean=0, std=0.01)
nn.init.zeros_(m.bias)
    def _make_layer(self, in_channel, out_channel, groups, block_num, is_stage2):
        layers = []
        # First unit of each stage downsamples (stride 2); group conv is disabled for
        # the first unit of stage 2 because its input has only 24 channels
        layers.append(ShuffleNetUnits(in_channel=in_channel, out_channel=out_channel, stride=2, groups=1 if is_stage2 else groups))
for _ in range(1, block_num):
layers.append(ShuffleNetUnits(in_channel=out_channel, out_channel=out_channel, stride=1, groups=groups))
return nn.Sequential(*layers)
def forward(self, x):
x = self.stage1(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.stage4(x)
x = self.globalpool(x)
x = torch.flatten(x, start_dim=1)
x = self.fc(x)
return x
def shufflenet_g1(**kwargs):
    planes = [144, 288, 576]
    layers = [4, 8, 4]
    model = ShuffleNet(planes=planes, layers=layers, groups=1, **kwargs)
    return model
def shufflenet_g2(**kwargs):
    planes = [200, 400, 800]
    layers = [4, 8, 4]
    model = ShuffleNet(planes=planes, layers=layers, groups=2, **kwargs)
    return model
model = shufflenet_g2()
x = torch.randn(1, 3, 224, 224)
out = model(x)
print(out.size())
2. ShuffleNet V2
The main contribution of ShuffleNet V2 is a set of four practical guidelines for lightweight network design; building on these guidelines, the authors redesign the V1 unit to obtain ShuffleNet V2.
2.1 FLOPs
FLOPs: the number of floating-point operations, a measure of computation commonly used as a proxy for the complexity of an algorithm or model.
For a convolutional layer:
$FLOPs = (C_{in} \times K^2) \times H \times W \times C_{out}$
- $C_{in}$: input channels
- $K$: kernel size
- $H, W$: output feature map size
- $C_{out}$: output channels
MAC: memory access cost. The paper argues that FLOPs alone is an indirect metric: MAC and the degree of parallelism also determine actual speed.
2.2 Practical Guidelines
G1. Equal input and output channel widths minimize memory access cost (MAC).
For a 1×1 convolution with $c_1$ input and $c_2$ output channels on an $h \times w$ feature map:
$FLOPs = (c_1 \times 1 \times 1)hwc_2 = hwc_1c_2$
$MAC = hwc_1 + hwc_2 + 1 \times 1 \times c_1 \times c_2 = hw(c_1 + c_2) + c_1c_2$
By the AM-GM inequality, $hw(c_1 + c_2) \geq 2hw\sqrt{c_1c_2}$, so with FLOPs fixed, $MAC \geq 2\sqrt{hw \times FLOPs} + \frac{FLOPs}{hw}$, with equality (the minimum) when $c_1 = c_2$.
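A quick numeric check of G1, holding FLOPs fixed by keeping $c_1 c_2$ constant (a hypothetical sketch, arbitrary sizes):

h, w = 56, 56
for c1, c2 in [(32, 512), (64, 256), (128, 128), (256, 64)]:
    mac = h * w * (c1 + c2) + c1 * c2   # FLOPs hw*c1*c2 is the same in every case
    print(f"c1={c1:3d} c2={c2:3d} MAC={mac}")
# MAC is smallest for the balanced case c1 == c2 == 128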
G2. Excessive group convolution increases MAC and therefore slows the model down.
For a 1×1 group convolution with $g$ groups:
$FLOPs = (c_1/g \times 1 \times 1)hwc_2 = hwc_1c_2/g$
$MAC = hwc_1 + hwc_2 + 1 \times 1 \times c_1 \times c_2/g = hw(c_1 + c_2) + c_1c_2/g$
Expressing MAC in terms of FLOPs:
$MAC = hwc_1 + \frac{FLOPs \times g}{c_1} + \frac{FLOPs}{hw}$
With FLOPs (and the input shape) fixed, a larger $g$ yields a larger MAC.
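And a matching check of G2, fixing FLOPs while increasing the number of groups (again a hypothetical sketch; to keep FLOPs constant, $c_2$ grows with $g$):

h, w, c1, flops = 56, 56, 128, 56 * 56 * 128 * 128
for g in [1, 2, 4, 8]:
    c2 = flops * g // (h * w * c1)           # output channels the fixed budget affords
    mac = h * w * (c1 + c2) + c1 * c2 // g   # hw(c1 + c2) + c1*c2/g
    print(f"g={g} c2={c2:4d} MAC={mac}")
# MAC grows with g even though FLOPs stays the same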
G3. Network fragmentation reduces the degree of parallelism: the fewer branches in the model, the faster it runs.
G4. Element-wise operations are non-negligible.
Element-wise operations include ReLU, tensor addition, bias addition, and depthwise convolution: they have small FLOPs but relatively heavy memory access.
2.3 ShuffleNet V2
ShuffleNet V2 building blocks: at the start of each basic unit, a channel split divides the input into two branches; one branch is left as identity while the other passes through three convolutions with equal input and output widths (satisfying G1). The two branches are concatenated rather than added (per G4), and a channel shuffle then mixes them. In the downsampling unit the channel split is removed and both branches are strided, so the channel count doubles.
Shortcomings of ShuffleNet V1, viewed through these guidelines: the pointwise group convolutions violate G2, the bottleneck structure violates G1, and the element-wise add in the shortcut connections runs against G4.
ShuffleNet V2 network architecture: the overall layout follows V1 (Conv1/MaxPool, three stages, then a final 1×1 conv before global pooling), as in the implementation below.
2.4 PyTorch Implementation
import torch
import torch.nn as nn
class ConvBNReLU(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size, stride, groups):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            # bias=False: the conv is immediately followed by BN, so a bias is redundant
            nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True),
        )
class ConvBN(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size, stride, groups):
        padding = (kernel_size - 1) // 2
        super(ConvBN, self).__init__(
            nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
        )
class HalfSplit(nn.Module):
    # Channel split: returns one half of the input chunked along `dim`
    def __init__(self, dim=0, first_half=True):
        super(HalfSplit, self).__init__()
        self.first_half = first_half
        self.dim = dim
def forward(self, x):
splits = torch.chunk(x, 2, dim=self.dim)
return splits[0] if self.first_half else splits[1]
class ChannelShuffle(nn.Module):
def __init__(self, groups):
super(ChannelShuffle, self).__init__()
self.groups = groups
def forward(self, x):
        # Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]
batch_size, num_channels, height, width = x.size()
channels_per_group = num_channels // self.groups
x = x.view(batch_size, self.groups, channels_per_group, height, width)
x = torch.transpose(x, dim0=1, dim1=2).contiguous()
x = x.view(batch_size, -1, height, width)
return x
class ShuffleNetUnits(nn.Module):
    def __init__(self, in_channel, out_channel, stride, groups):
        super(ShuffleNetUnits, self).__init__()
        self.stride = stride
        if self.stride > 1:
            # Downsampling unit: no channel split; both branches are strided and
            # concatenated, so the residual branch makes up the remaining channels
            mid_channel = out_channel - in_channel
        else:
            # Basic unit: channel split sends half the channels through each branch
            mid_channel = out_channel // 2
            in_channel = mid_channel
        self.first_split = HalfSplit(dim=1, first_half=True)
        self.second_split = HalfSplit(dim=1, first_half=False)
        # Per V2's own guidelines, the 1x1 convs use no groups (G2) and the 3x3 is depthwise
        self.bottleneck = nn.Sequential(
            ConvBNReLU(in_channel=in_channel, out_channel=in_channel, kernel_size=1, stride=1, groups=1),
            # 3x3 depthwise convolution (groups = channels); the paper applies no ReLU after it
            ConvBN(in_channel=in_channel, out_channel=in_channel, kernel_size=3, stride=stride, groups=in_channel),
            # the final 1x1 conv adjusts the branch to its output width
            ConvBNReLU(in_channel=in_channel, out_channel=mid_channel, kernel_size=1, stride=1, groups=1),
        )
        if self.stride > 1:
            self.shortcut = nn.Sequential(
                # downsampling shortcut: 3x3 depthwise conv, then a 1x1 conv
                ConvBN(in_channel=in_channel, out_channel=in_channel, kernel_size=3, stride=stride, groups=in_channel),
                ConvBNReLU(in_channel=in_channel, out_channel=in_channel, kernel_size=1, stride=1, groups=1),
            )
        # shuffle with 2 groups so the two concatenated branches exchange information
        self.channel_shuffle = ChannelShuffle(groups=2)
def forward(self, x):
if self.stride > 1:
x1 = self.bottleneck(x)
x2 = self.shortcut(x)
else:
x1 = self.first_split(x)
x2 = self.second_split(x)
x1 = self.bottleneck(x1)
out = torch.cat([x1, x2], dim=1)
out = self.channel_shuffle(out)
return out
class ShuffleNetV2(nn.Module):
def __init__(self, planes, layers, groups, num_classes=1000):
super(ShuffleNetV2, self).__init__()
self.groups = groups
self.stage1 = nn.Sequential(
ConvBNReLu(in_channel=3, out_channel=24, kernel_size=3, stride=2, groups=1),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
self.stage2 = self._make_layers(in_channel=24, out_channel=planes[0], block_num=layers[0], is_stage2=True)
self.stage3 = self._make_layers(in_channel=planes[0], out_channel=planes[1], block_num=layers[1], is_stage2=False)
self.stage4 = self._make_layers(in_channel=planes[1], out_channel=planes[2], block_num=layers[2], is_stage2=False)
self.conv5 = ConvBNReLu(in_channel=planes[2], out_channel=planes[3], kernel_size=1, stride=1, groups=1)
self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Sequential(
nn.Dropout(p=0.2),
nn.Linear(in_features=planes[3], out_features=num_classes)
)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                # A constant weight of 1 would be a degenerate init for Linear layers;
                # use a small Gaussian instead, as in the V1 implementation above
                nn.init.normal_(m.weight, mean=0, std=0.01)
                nn.init.zeros_(m.bias)
def _make_layers(self, in_channel, out_channel, block_num, is_stage2):
layers = []
layers.append(ShuffleNetUnits(in_channel=in_channel, out_channel=out_channel, stride=2, groups=1 if is_stage2 else self.groups))
for _ in range(1, block_num):
layers.append(ShuffleNetUnits(in_channel=out_channel, out_channel=out_channel, stride=1, groups=self.groups))
return nn.Sequential(*layers)
def forward(self, x):
x = self.stage1(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.stage4(x)
x = self.conv5(x)
x = self.globalpool(x)
x = torch.flatten(x, start_dim=1)
x = self.fc(x)
return x
def shufflenet_v2_x0_5(**kwargs):
    planes = [48, 96, 192, 1024]
    layers = [4, 8, 4]
    model = ShuffleNetV2(planes=planes, layers=layers, groups=1, **kwargs)
    return model
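# Quick smoke test, mirroring the V1 check above
model = shufflenet_v2_x0_5()
x = torch.randn(1, 3, 224, 224)
out = model(x)
print(out.size())  # expected: torch.Size([1, 1000])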