Reposted from: https://blog.youkuaiyun.com/u014114990/article/details/90665141
1. Why merge BN layers
When training deep networks, BN (Batch Normalization) layers speed up convergence and help control overfitting; they are usually placed right after convolutional layers. By normalizing activations, BN also mitigates vanishing and exploding gradients. However, while BN is beneficial during training, at inference time it adds extra per-layer computation, which slows the forward pass and consumes additional memory or GPU memory. Since many modern architectures (ResNet, MobileNet, Xception, ShuffleNet, etc.) use BN, it is worthwhile to fold the BN parameters into the preceding convolution to speed up forward inference.
2. The math behind merging a BN layer into a convolution layer
In a convolutional layer, each output channel is computed as $y = W \ast x + b$ (if the convolution has no bias term, a zero bias is added first; the script below does this automatically). Caffe's BatchNorm layer stores three blobs: the accumulated mean, the accumulated variance, and a moving-average factor $s$, so the effective statistics are $\mu = \text{mean}/s$ and $\sigma^2 = \text{var}/s$. BatchNorm normalizes the convolution output, and the following Scale layer applies a per-channel affine transform with weight $\gamma$ and bias $\beta$:

$$\hat{y} = \gamma \cdot \frac{y - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta$$

Substituting $y = W \ast x + b$ and letting $\alpha = \gamma / \sqrt{\sigma^2 + \varepsilon}$ gives an equivalent convolution

$$\hat{y} = (\alpha W) \ast x + \big(\alpha (b - \mu) + \beta\big)$$

so the merged parameters are $W' = \alpha W$ and $b' = \alpha (b - \mu) + \beta$. After folding, the BN and Scale layers become identity transforms and can be removed from the inference graph.
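The folding step can be illustrated with a short NumPy sketch (illustrative only: the function name fold_bn_into_conv is made up here, eps = 1e-5 matches the script below, and the Caffe moving-average factor is passed in explicitly):

```python
import numpy as np

def fold_bn_into_conv(W, b, bn_mean, bn_var, bn_factor, gamma, beta, eps=1e-5):
    """Fold BatchNorm (+ Scale) statistics into convolution weights and bias.

    W: conv weights, shape (out_channels, in_channels, kh, kw)
    b: conv bias, shape (out_channels,)
    bn_mean, bn_var: accumulated statistics stored by Caffe's BatchNorm layer
    bn_factor: Caffe's moving-average factor; effective mean/var = stored / factor
    gamma, beta: per-channel weight and bias of the Scale layer
    """
    mean = bn_mean / bn_factor
    var = bn_var / bn_factor
    alpha = gamma / np.sqrt(var + eps)          # per-channel multiplier
    W_folded = W * alpha[:, None, None, None]   # scale each output channel
    b_folded = alpha * (b - mean) + beta        # new bias absorbs the BN shift
    return W_folded, b_folded

# Sanity check on random data: conv + BN + Scale == folded conv
rng = np.random.RandomState(0)
out_c, in_c = 4, 3
W = rng.randn(out_c, in_c, 1, 1)
b = rng.randn(out_c)
x = rng.randn(in_c)
mean, var = rng.randn(out_c), rng.rand(out_c)
gamma, beta = rng.randn(out_c), rng.randn(out_c)

y = W.reshape(out_c, in_c).dot(x) + b                   # 1x1 conv on a single pixel
y_bn = gamma * (y - mean) / np.sqrt(var + 1e-5) + beta  # BN + Scale
Wf, bf = fold_bn_into_conv(W, b, mean, var, 1.0, gamma, beta)
assert np.allclose(y_bn, Wf.reshape(out_c, in_c).dot(x) + bf)
```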
3. Experimental results
Machine: GTX 1080Ti GPU, Intel i7 CPU.
The experiment compares ResNet50 before and after merging the BN layers: classification accuracy is unchanged, while the forward pass is noticeably faster.

| Model | CPU forward time | GPU forward time |
| --- | --- | --- |
| ResNet50 (before merging) | 176.17 ms | 11.03 ms |
| ResNet50 (after merging) | 161.69 ms | 7.3 ms |
| Speedup | 10% | 51% |
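For reference, forward time can be measured with a short loop like the one below. This is only a rough sketch: the file names deploy.prototxt / deploy.caffemodel and the input blob name 'data' are assumptions, and absolute numbers depend on hardware, batch size, and the Caffe build.

```python
import time
import numpy as np
import caffe

caffe.set_mode_cpu()  # use caffe.set_mode_gpu() and caffe.set_device(0) for GPU timing
net = caffe.Net('deploy.prototxt', 'deploy.caffemodel', caffe.TEST)

# Fill the input blob (assumed to be named 'data') with random values
net.blobs['data'].data[...] = np.random.rand(*net.blobs['data'].data.shape)

net.forward()  # warm-up run
n_runs = 50
start = time.time()
for _ in range(n_runs):
    net.forward()
print('average forward time: {:.2f} ms'.format((time.time() - start) / n_runs * 1000))
```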
4. Python script for the merge
The script requires Caffe's Python interface (pycaffe). Note that it is written for Python 2 (print statements, has_key, xrange).
```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import numpy as np
import sys
import os
import os.path as osp
import google.protobuf as pb
import google.protobuf.text_format
from argparse import ArgumentParser
import caffe

caffe.set_mode_cpu()


def load_and_fill_biases(src_model, src_weights, dst_model, dst_weights):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        if layer.type == 'Convolution':  # or layer.type == 'Scale':
            # Add bias layer if needed
            if layer.convolution_param.bias_term == False:
                layer.convolution_param.bias_term = True
                layer.convolution_param.bias_filler.type = 'constant'
                layer.convolution_param.bias_filler.value = 0.0

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))

    caffe.set_mode_cpu()
    net_src = caffe.Net(src_model, src_weights, caffe.TEST)
    net_dst = caffe.Net(dst_model, caffe.TEST)
    for key in net_src.params.keys():
        for i in range(len(net_src.params[key])):
            net_dst.params[key][i].data[:] = net_src.params[key][i].data[:]

    if dst_weights is not None:
        # Store params
        pass

    return net_dst


def merge_conv_and_bn(net, i_conv, i_bn, i_scale):
    # This is based on Kyeheyon's work
    assert(i_conv != None)
    assert(i_bn != None)

    def copy_double(data):
        return np.array(data, copy=True, dtype=np.double)

    key_conv = net._layer_names[i_conv]
    key_bn = net._layer_names[i_bn]
    key_scale = net._layer_names[i_scale] if i_scale else None

    # Copy
    bn_mean = copy_double(net.params[key_bn][0].data)
    bn_variance = copy_double(net.params[key_bn][1].data)
    num_bn_samples = copy_double(net.params[key_bn][2].data)

    # and Invalidate the BN layer
    net.params[key_bn][0].data[:] = 0
    net.params[key_bn][1].data[:] = 1
    net.params[key_bn][2].data[:] = 1
    if num_bn_samples[0] == 0:
        num_bn_samples[0] = 1

    if net.params.has_key(key_scale):
        print 'Combine {:s} + {:s} + {:s}'.format(key_conv, key_bn, key_scale)
        scale_weight = copy_double(net.params[key_scale][0].data)
        scale_bias = copy_double(net.params[key_scale][1].data)
        net.params[key_scale][0].data[:] = 1
        net.params[key_scale][1].data[:] = 0
    else:
        print 'Combine {:s} + {:s}'.format(key_conv, key_bn)
        scale_weight = 1
        scale_bias = 0

    weight = copy_double(net.params[key_conv][0].data)
    bias = copy_double(net.params[key_conv][1].data)
    alpha = scale_weight / np.sqrt(bn_variance / num_bn_samples[0] + 1e-5)
    net.params[key_conv][1].data[:] = bias * alpha + (scale_bias - (bn_mean / num_bn_samples[0]) * alpha)
    for i in range(len(alpha)):
        net.params[key_conv][0].data[i] = weight[i] * alpha[i]


def merge_batchnorms_in_net(net):
    # for each BN
    for i, layer in enumerate(net.layers):
        if layer.type != 'BatchNorm':
            continue

        l_name = net._layer_names[i]

        l_bottom = net.bottom_names[l_name]
        assert(len(l_bottom) == 1)
        l_bottom = l_bottom[0]
        l_top = net.top_names[l_name]
        assert(len(l_top) == 1)
        l_top = l_top[0]

        can_be_absorbed = True

        # Search all (bottom) layers
        for j in xrange(i - 1, -1, -1):
            tops_of_j = net.top_names[net._layer_names[j]]
            if l_bottom in tops_of_j:
                if net.layers[j].type not in ['Convolution', 'InnerProduct']:
                    can_be_absorbed = False
                else:
                    # There must be only one layer
                    conv_ind = j
                    break
        if not can_be_absorbed:
            continue

        # find the following Scale
        scale_ind = None
        for j in xrange(i + 1, len(net.layers)):
            bottoms_of_j = net.bottom_names[net._layer_names[j]]
            if l_top in bottoms_of_j:
                if scale_ind:
                    # Followed by two or more layers
                    scale_ind = None
                    break

                if net.layers[j].type in ['Scale']:
                    scale_ind = j

                    top_of_j = net.top_names[net._layer_names[j]][0]
                    if top_of_j == bottoms_of_j[0]:
                        # On-the-fly => Can be merged
                        break
                else:
                    # Followed by a layer which is not 'Scale'
                    scale_ind = None
                    break

        merge_conv_and_bn(net, conv_ind, i, scale_ind)

    return net


def process_model(net, src_model, dst_model, func_loop, func_finally):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        map(lambda x: x(layer, net, model, i), func_loop)

    map(lambda x: x(net, model), func_finally)

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))


# Functions to remove (redundant) BN and Scale layers
to_delete_empty = []


def pick_empty_layers(layer, net, model, i):
    if layer.type not in ['BatchNorm', 'Scale']:
        return

    bottom = layer.bottom[0]
    top = layer.top[0]

    if (bottom != top):
        # Not supported yet
        return

    if layer.type == 'BatchNorm':
        zero_mean = np.all(net.params[layer.name][0].data == 0)
        one_var = np.all(net.params[layer.name][1].data == 1)

        if zero_mean and one_var:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)

    if layer.type == 'Scale':
        no_scaling = np.all(net.params[layer.name][0].data == 1)
        zero_bias = np.all(net.params[layer.name][1].data == 0)

        if no_scaling and zero_bias:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)


def remove_empty_layers(net, model):
    map(model.layer.remove, to_delete_empty)


# A function to add 'engine: CAFFE' param into 1x1 convolutions
def set_engine_caffe(layer, net, model, i):
    if layer.type == 'Convolution':
        if layer.convolution_param.kernel_size == 1\
            or (layer.convolution_param.kernel_h == layer.convolution_param.kernel_w == 1):
            layer.convolution_param.engine = dict(layer.convolution_param.Engine.items())['CAFFE']


def main():
    # Set default output file names
    if args.output_model is None:
        file_name = osp.splitext(args.model)[0]
        args.output_model = file_name + '_inference.prototxt'
    if args.output_weights is None:
        file_name = osp.splitext(args.weights)[0]
        args.output_weights = file_name + '_inference.caffemodel'

    net = load_and_fill_biases(args.model, args.weights, args.model + '.temp.pt', None)
    net = merge_batchnorms_in_net(net)

    process_model(net, args.model + '.temp.pt', args.output_model,
                  [pick_empty_layers, set_engine_caffe],
                  [remove_empty_layers])

    # Store params
    net.save(args.output_weights)


if __name__ == '__main__':
    parser = ArgumentParser(
        description="Generate Batch Normalized model for inference")
    parser.add_argument('--model', default="MobileNetSSD_deploy.prototxt", help="The net definition prototxt")
    parser.add_argument('--weights', default="MobileNetSSD_deploy.caffemodel", help="The weights caffemodel")
    parser.add_argument('--output_model')
    parser.add_argument('--output_weights')
    args = parser.parse_args()
    main()
```
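Assuming the script is saved as merge_bn.py (the name is chosen here for illustration) and pycaffe is importable, it can be run as, for example, `python merge_bn.py --model your_net.prototxt --weights your_net.caffemodel`; the argument defaults point to MobileNetSSD_deploy.prototxt / MobileNetSSD_deploy.caffemodel. If --output_model / --output_weights are omitted, the merged model is written next to the inputs as <name>_inference.prototxt and <name>_inference.caffemodel.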
Script download link:
https://download.youkuaiyun.com/download/kangdi7547/10578152