ResNet (residual networks) has by now reached a V2 version. So how does V2 differ from V1? Below is my personal summary after reading the paper:
Let's go straight to the figure comparing the two block structures:

The left side is V1 and the right side is V2. The main difference is the order of BN and ReLU relative to the weight (convolution) layers and the addition, i.e., the distinction the paper draws between post-activation and pre-activation.
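For reference, a residual unit can be written (roughly in the paper's notation) as:

    y_l = h(x_l) + F(x_l, W_l)
    x_{l+1} = f(y_l)

where h(x_l) = x_l is the identity shortcut, F is the residual branch (the weight layers), and f is whatever function is applied after the addition.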
From these formulas we can see that for V1 the function f is ReLU, while for V2 it is a plain identity mapping. This brings two benefits:

1. Optimization becomes easier for very deep networks (the paper's experiments use a 1001-layer ResNet). Because of how ReLU behaves, the shortcut path is blocked whenever the signal is negative, so early training is very slow. With an identity mapping, the signal can propagate freely through all layers both early and late in training, which makes the network easier to train. Of course, this problem is not severe at moderate depth (the experiments used fewer than 164 layers).
2. Less overfitting. In V1, although the block's input is normalized by BN, once the shortcut is added the signal passed to the next weight layer is no longer normalized. In V2, the signal fed into every weight layer is normalized, which helps reduce overfitting.
To summarize: for networks with fewer layers you can keep using the V1 block, while for deeper networks the V2 block is the better choice.
Finally, a PyTorch implementation of ResNet-34 using both block types:
# ResNet-34, implemented with both the v1 and v2 residual block
from torch import nn
from torch.nn import functional as F


class ResidualBlockV1(nn.Module):
    # Residual block v1 (post-activation): conv-BN-ReLU-conv-BN, add the shortcut, then ReLU
    def __init__(self, inchannel, outchannel, stride=1, shortcut=None):
        super(ResidualBlockV1, self).__init__()
        self.basic = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, 3, stride, 1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, 3, 1, 1, bias=False),
            nn.BatchNorm2d(outchannel))
        self.shortcut = shortcut  # the shortcut branch is customizable

    def forward(self, x):
        out = self.basic(x)
        residual = x if self.shortcut is None else self.shortcut(x)
        out += residual
        return F.relu(out)  # post-activation: ReLU is applied after the addition
class ResidualBlockV2(nn.Module):
    # Residual block v2 (pre-activation): BN-ReLU-conv-BN-ReLU-conv, then add the shortcut
    def __init__(self, inchannel, outchannel, stride=1, shortcut=None):
        super(ResidualBlockV2, self).__init__()
        self.basic = nn.Sequential(
            nn.BatchNorm2d(inchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(inchannel, outchannel, 3, stride, 1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, 3, 1, 1, bias=False))
        self.shortcut = shortcut  # the shortcut branch is customizable

    def forward(self, x):
        out = self.basic(x)
        residual = x if self.shortcut is None else self.shortcut(x)
        out += residual
        return out  # f is the identity: no activation after the addition
class make_layer(nn.Module):
    # Reusable stage of block_num residual blocks; the first block uses a
    # 1x1 projection shortcut to match channels and stride
    def __init__(self,
                 inchannel,
                 outchannel,
                 block_num,
                 stride=1,
                 use_v1_block=True):
        super(make_layer, self).__init__()
        layers = []
        shortcut = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, 1, stride, bias=False),
            nn.BatchNorm2d(outchannel))
        if use_v1_block:
            layers.append(ResidualBlockV1(inchannel, outchannel, stride, shortcut))
            for i in range(1, block_num):
                layers.append(ResidualBlockV1(outchannel, outchannel))
        else:
            layers.append(ResidualBlockV2(inchannel, outchannel, stride, shortcut))
            for i in range(1, block_num):
                layers.append(ResidualBlockV2(outchannel, outchannel))
        self.layer = nn.Sequential(*layers)

    def forward(self, x):
        out = self.layer(x)
        return out
class ResNet34V1(nn.Module):
    def __init__(self):
        super(ResNet34V1, self).__init__()
        self.pre = nn.Sequential(
            nn.Conv2d(3, 64, 7, 2, 3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2, 1))
        # build each layer
        self.layer1 = make_layer(64, 64, 3)
        self.layer2 = make_layer(64, 128, 4, stride=2)
        self.layer3 = make_layer(128, 256, 6, stride=2)
        self.layer4 = make_layer(256, 512, 3, stride=2)
        self.fc = nn.Linear(512, 1000)

    def forward(self, x):
        x = self.pre(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = F.avg_pool2d(x, 7)  # assumes a 224x224 input, which yields a 7x7 feature map here
        x = x.view(x.size(0), -1)
        return self.fc(x)
class ResNet34V2(nn.Module):
    def __init__(self):
        super(ResNet34V2, self).__init__()
        self.pre = nn.Sequential(
            nn.Conv2d(3, 64, 7, 2, 3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2, 1))
        # build each layer
        self.layer1 = make_layer(64, 64, 3, use_v1_block=False)
        self.layer2 = make_layer(64, 128, 4, stride=2, use_v1_block=False)
        self.layer3 = make_layer(128, 256, 6, stride=2, use_v1_block=False)
        self.layer4 = make_layer(256, 512, 3, stride=2, use_v1_block=False)
        # the pre-activation paper applies one extra BN + ReLU after the last
        # residual unit, since no activation follows the final addition
        self.post = nn.Sequential(
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True))
        self.fc = nn.Linear(512, 1000)

    def forward(self, x):
        x = self.pre(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.post(x)
        x = F.avg_pool2d(x, 7)
        x = x.view(x.size(0), -1)
        return self.fc(x)
print('This is ResNet34 v1:')
print(ResNet34V1())
print('This is ResNet34 v2:')
print(ResNet34V2())
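As a quick sanity check, here is a minimal sketch (assuming a standard 224x224 RGB input, which the 7x7 average pooling above relies on) that runs a random batch through both networks and confirms the 1000-way output:

import torch

x = torch.randn(1, 3, 224, 224)             # one random ImageNet-sized image
for model in (ResNet34V1(), ResNet34V2()):
    model.eval()                             # use BN running stats for this single sample
    with torch.no_grad():
        y = model(x)
    print(type(model).__name__, 'output shape:', tuple(y.shape))  # expected: (1, 1000)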
References:
- Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition." CVPR 2016.
- Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks." ECCV 2016.