After reading through the R-CNN series of papers, I started going through the source code. This post briefly records my understanding of the roi_pooling_layer source. The author first adds the layer's parameter definition to caffe.proto: a roi_pooling_param field in LayerParameter, plus a ROIPoolingParameter message with three fields.
optional ROIPoolingParameter roi_pooling_param = 43;
message ROIPoolingParameter {
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
// Multiplicative spatial scale factor to translate ROI coords from their
// input scale to the scale used when pooling
optional float spatial_scale = 3 [default = 1];
}
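A ROIPooling layer definition in a network prototxt then looks roughly like this (reproduced from memory of Fast R-CNN's VGG16 model, so treat the exact names and values as illustrative):
layer {
  name: "roi_pool5"
  type: "ROIPooling"
  bottom: "conv5_3"  # the feature map
  bottom: "rois"     # one [batch_index, x1, y1, x2, y2] row per ROI
  top: "pool5"
  roi_pooling_param {
    pooled_w: 7
    pooled_h: 7
    spatial_scale: 0.0625  # 1/16, since VGG16 applies four 2x2 poolings before conv5_3
  }
}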
LayerSetUp
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
CHECK_GT(roi_pool_param.pooled_h(), 0)
<< "pooled_h must be > 0";
CHECK_GT(roi_pool_param.pooled_w(), 0)
<< "pooled_w must be > 0";
pooled_height_ = roi_pool_param.pooled_h();
pooled_width_ = roi_pool_param.pooled_w();
spatial_scale_ = roi_pool_param.spatial_scale();
LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
All this does is validate the parameters and store them in member variables.
Reshape
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
channels_ = bottom[0]->channels();
height_ = bottom[0]->height();
width_ = bottom[0]->width();
// top[0] has the same number of channels as bottom[0]; pooling does not
// change the channel count.
// top[0]'s num equals the number of ROIs (bottom[1]->num()): each ROI is
// mapped onto conv5 and pooled into its own output.
top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
pooled_width_);
max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
pooled_width_);
}
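As a concrete illustration, with hypothetical shapes (a VGG16-style conv5 feature map, 128 ROIs, and 7x7 pooling; these numbers are mine, not from the source):
// bottom[0]: (1, 512, 38, 50)   conv5 feature map of one image
// bottom[1]: (128, 5, 1, 1)     128 ROIs, each [batch_index, x1, y1, x2, y2]
// top[0]:    (128, 512, 7, 7)   one 7x7 pooled map per ROI and channel
// max_idx_:  (128, 512, 7, 7)   argmax positions, reused in the backward pass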
Forward_cpu
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
// bottom_data is the feature map obtained by forwarding the whole image through the conv layers, i.e. conv5
const Dtype* bottom_rois = bottom[1]->cpu_data();
// bottom_rois holds the ROIs: each is 5 values, a batch_index plus the two corner coordinates (x1, y1, x2, y2)
// Number of ROIs
int num_rois = bottom[1]->num(); // number of ROIs
int batch_size = bottom[0]->num(); // number of images (conv5 feature maps) in the batch
int top_count = top[0]->count();
Dtype* top_data = top[0]->mutable_cpu_data();
caffe_set(top_count, Dtype(-FLT_MAX), top_data);
// fill top_data with -FLT_MAX (caffe_set arguments: count, value, destination) so any real activation wins the max
int* argmax_data = max_idx_.mutable_cpu_data();
caffe_set(top_count, -1, argmax_data);
// For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
for (int n = 0; n < num_rois; ++n) {
int roi_batch_ind = bottom_rois[0]; // index of the image this ROI belongs to
int roi_start_w = round(bottom_rois[1] * spatial_scale_);
int roi_start_h = round(bottom_rois[2] * spatial_scale_);
int roi_end_w = round(bottom_rois[3] * spatial_scale_);
int roi_end_h = round(bottom_rois[4] * spatial_scale_);
CHECK_GE(roi_batch_ind, 0);
CHECK_LT(roi_batch_ind, batch_size);
// Force malformed ROIs to be 1x1
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
// bin size = ROI size / pooled output size: one pooled cell covers
// roughly bin_size_h x bin_size_w feature-map pixels
const Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height_);
const Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width_);
// point batch_data at the conv5 feature map of the image this ROI belongs to
const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
for (int c = 0; c < channels_; ++c) {
for (int ph = 0; ph < pooled_height_; ++ph) {
for (int pw = 0; pw < pooled_width_; ++pw) {
// Compute pooling region for this output unit:
// start (included) = floor(ph * roi_height / pooled_height_)
// end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
* bin_size_w));
hstart = min(max(hstart + roi_start_h, 0), height_); // clamp to the feature map; height_ is the conv5 height
hend = min(max(hend + roi_start_h, 0), height_);
wstart = min(max(wstart + roi_start_w, 0), width_);
wend = min(max(wend + roi_start_w, 0), width_);
// why add roi_start_h? the bin coordinates are relative to the ROI, whose top-left corner is generally not at (0, 0) on the feature map
bool is_empty = (hend <= hstart) || (wend <= wstart);
const int pool_index = ph * pooled_width_ + pw;
if (is_empty) {
top_data[pool_index] = 0;
argmax_data[pool_index] = -1;
}
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = h * width_ + w;
if (batch_data[index] > top_data[pool_index]) {
top_data[pool_index] = batch_data[index]; // the winning conv5 activation becomes the output
argmax_data[pool_index] = index;
}
}
}
}
}
// Increment all data pointers by one channel
batch_data += bottom[0]->offset(0, 1);
top_data += top[0]->offset(0, 1);
argmax_data += max_idx_.offset(0, 1);
}
// Increment ROI data pointer
bottom_rois += bottom[1]->offset(1);
}
}
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
NOT_IMPLEMENTED;
}
Overall, the forward pass does the following:
First, map each ROI onto the feature map by multiplying its image coordinates by spatial_scale (equal to 1 over the product of all the strides up to conv5). Then compute each output value independently: every output cell represents a region of the feature map of size bin_size_h x bin_size_w, with bin_size_h = roi_height / pooled_height and bin_size_w = roi_width / pooled_width. For each output cell, scan the feature-map region it maps back to, take the maximum, and record the position of that maximum in max_idx_ for the backward pass.
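As a sanity check on the coordinate arithmetic, here is a minimal standalone sketch with made-up numbers (stride-16 features and a 7x7 output; none of these values come from the original code):
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const float spatial_scale = 1.0f / 16.0f;  // 1 / (product of all strides)
  const int pooled_h = 7, pooled_w = 7;
  // A made-up ROI in image coordinates: (x1, y1, x2, y2)
  const float x1 = 32.f, y1 = 48.f, x2 = 271.f, y2 = 207.f;

  const int roi_start_w = static_cast<int>(std::round(x1 * spatial_scale));  // 2
  const int roi_start_h = static_cast<int>(std::round(y1 * spatial_scale));  // 3
  const int roi_end_w = static_cast<int>(std::round(x2 * spatial_scale));    // 17
  const int roi_end_h = static_cast<int>(std::round(y2 * spatial_scale));    // 13
  const int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);           // 11
  const int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);            // 16
  const float bin_size_h = float(roi_height) / pooled_h;                     // ~1.571
  const float bin_size_w = float(roi_width) / pooled_w;                      // ~2.286

  // Feature-map region pooled into output bin (ph = 3, pw = 0):
  const int ph = 3, pw = 0;
  const int hstart = static_cast<int>(std::floor(ph * bin_size_h)) + roi_start_h;     // 4 + 3
  const int hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h)) + roi_start_h;  // 7 + 3
  const int wstart = static_cast<int>(std::floor(pw * bin_size_w)) + roi_start_w;     // 0 + 2
  const int wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w)) + roi_start_w;  // 3 + 2
  std::printf("bin (%d, %d) pools rows [%d, %d) and cols [%d, %d)\n",
              ph, pw, hstart, hend, wstart, wend);
  return 0;
}
This prints bin (3, 0) pools rows [7, 10) and cols [2, 5). Note that adjacent bins can overlap by a row or column because of the floor/ceil rounding; that is inherent to this implementation.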
Backward_gpu
The author implements the backward pass on the GPU only; as shown above, the CPU version is NOT_IMPLEMENTED.
template <typename Dtype>
__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
const int* argmax_data, const int num_rois, const Dtype spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, Dtype* bottom_diff,
const Dtype* bottom_rois) {
CUDA_KERNEL_LOOP(index, nthreads) {
// (n, c, h, w) coords in bottom data: each thread handles one element of
// bottom[0], i.e. one element of the conv5 feature map
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
Dtype gradient = 0;
// Accumulate gradient over all ROIs that pooled this element
for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Skip if ROI's batch index doesn't match n: the ROI must come from the same image in the batch as this feature-map element
if (n != roi_batch_ind) {
continue;
}
int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); // scale the ROI from image coordinates to feature-map coordinates
int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
// Skip if ROI doesn't include (h, w)
const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
h >= roi_start_h && h <= roi_end_h);
if (!in_roi) {
continue;
}
// offset of this ROI's (roi_n, c) slice within the pooled output
int offset = (roi_n * channels + c) * pooled_height * pooled_width;
const Dtype* offset_top_diff = top_diff + offset; // gradients for this slice
const int* offset_argmax_data = argmax_data + offset; // argmax positions (on conv5) for this slice
// Compute feasible set of pooled units that could have pooled
// this bottom unit
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height);
Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width);
int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);
phstart = min(max(phstart, 0), pooled_height);
phend = min(max(phend, 0), pooled_height);
pwstart = min(max(pwstart, 0), pooled_width);
pwend = min(max(pwend, 0), pooled_width);
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// this conv5 element may be the argmax of several pooled cells
// (possibly in several ROIs), so accumulate all of their gradients
if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
gradient += offset_top_diff[ph * pooled_width + pw];
}
}
}
}
bottom_diff[index] = gradient;
}
}
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
if (!propagate_down[0]) {
return;
}
const Dtype* bottom_rois = bottom[1]->gpu_data();
const Dtype* top_diff = top[0]->gpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
const int count = bottom[0]->count();
caffe_gpu_set(count, Dtype(0.), bottom_diff);
const int* argmax_data = max_idx_.gpu_data();
// NOLINT_NEXT_LINE(whitespace/operators)
ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, top_diff, argmax_data, top[0]->num(), spatial_scale_, channels_,
height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois);
CUDA_POST_KERNEL_CHECK;
}
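For reference, the launch configuration uses Caffe's standard CUDA helpers (reproduced from memory of device_alternate.hpp, so treat as approximate): one thread is launched per element of bottom[0], and CUDA_KERNEL_LOOP is a grid-stride loop.
// CUDA: grid-stride loop; each thread handles index, index + stride, ...
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

// CUDA: number of blocks needed to cover N elements
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
Parallelizing over bottom elements (a gather) rather than over top elements (a scatter) means each thread writes exactly one bottom_diff entry, so no atomic operations are needed.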
To summarize:
The kernel walks over every element of the feature map, decoding its (n, c, h, w) coordinates so that the result can be written into the right slot of bottom_diff. For each element it maps every ROI onto the feature map and checks whether (h, w) lies inside it; if not, it can continue straight away. This point initially puzzled me a little, but it is not hard to see: a point can only contribute to an ROI's output if it lies inside that ROI (it has to be the maximum of some bin), so a point outside the ROI certainly contributes nothing. Conversely, one point may contribute to several bins and several overlapping ROIs, so when the loss flows back, all the gradients arriving at the same point are accumulated.
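In symbols (my own notation, not from the source), the gradient reaching feature-map element $x_i$ is

$$\frac{\partial L}{\partial x_i} = \sum_{r=1}^{R} \sum_{j \,:\, \operatorname{argmax}(r,\, j) = i} \frac{\partial L}{\partial y_{r,j}}$$

where $y_{r,j}$ is the $j$-th pooled output of ROI $r$ and $\operatorname{argmax}(r, j)$ is the position recorded in max_idx_ during the forward pass. This is just the usual max-pooling gradient, except that the outer sum also runs over all (possibly overlapping) ROIs.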