Ubuntu14.04下Py-faster-rcnn在CPU下的配置编译
- 安装依赖项等
- 下载faster-rcnn
git clone --recursive https://github.com/rbgirshick/py-faster-rcnn.git
- 编译cython
打开py-faster-rcnn/lib/setup.py,修改成如下后make
#CUDA = locate_cuda()
#self.set_executable('compiler_so', CUDA['nvcc'])
#Extension('nms.gpu_nms',
#['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'],
#library_dirs=[CUDA['lib64']],
#libraries=['cudart'],
#language='c++',
#runtime_library_dirs=[CUDA['lib64']],
# this syntax is specific to this build system
# we're only going to use certain compiler args with nvcc and not with
# gcc the implementation of this trick is in customize_compiler() below
#extra_compile_args={'gcc': ["-Wno-unused-function"],
# 'nvcc': ['-arch=sm_35',
# '--ptxas-options=-v',
# '-c',
# '--compiler-options',
# "'-fPIC'"]},
#include_dirs = [numpy_include, CUDA['include']]
#),
4.
修改Makefile.config,将如下两行前的#去掉
CPU_ONLY := 1
WITH_PYTHON_LAYER := 1
修改CMakeLists.txt,如下:
caffe_option(CPU_ONLY "Build Caffe without CUDA support" ON)
修改/py-faster-rcnn/lib/fast_rcnn/config.py,如下:
__C.USE_GPU_NMS = False
修改/py-faster-rcnn/lib/fast_rcnn/nms_wrapper.py,如下:
#from nms.gpu_nms import gpu_nms
def nms(dets, thresh, force_cpu=True):
5.
make -j8 && make pycaffe
6.完成以上修改后,即可进行测试。如果需要训练,还需修改py-faster-rcnn/caffe-fast-rcnn/src/caffe/layers/smooth_L1_loss_layer.cpp 和 roi_pooling_layer.cpp,修改后重新执行make -j8 && make pycaffe
其中,smooth_L1_loss_layer.cpp,修改
/**
 * CPU forward pass of the Smooth L1 loss layer.
 *
 * Steps, in order:
 *   1. diff_   = bottom[0] - bottom[1]
 *   2. diff_  *= bottom[2]                      (only when has_weights_)
 *   3. errors_ = f(diff_) element-wise, where
 *        f(x) = 0.5 * sigma2_ * x^2   if |x| < 1 / sigma2_
 *               |x| - 0.5 / sigma2_   otherwise
 *   4. errors_ *= bottom[3]                     (only when has_weights_)
 *   5. top[0][0] = sum(errors_) / bottom[0]->num()
 *
 * diff_ is left populated so that Backward_cpu can reuse it.
 *
 * @param bottom  bottom[0] and bottom[1] are the two inputs being compared;
 *                bottom[2] and bottom[3] are read only when has_weights_.
 * @param top     top[0] receives the scalar loss in its first element.
 */
template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
      << "Inputs must have the same dimension.";
  const int count = bottom[0]->count();

  // diff_ := bottom[0] - bottom[1]
  caffe_sub(count,
            bottom[0]->cpu_data(),
            bottom[1]->cpu_data(),
            diff_.mutable_cpu_data());
  if (has_weights_) {
    // diff_ := bottom[2] * (bottom[0] - bottom[1])
    caffe_mul(count,
              bottom[2]->cpu_data(),
              diff_.cpu_data(),
              diff_.mutable_cpu_data());
  }

  const Dtype* in = diff_.cpu_data();
  Dtype* out = errors_.mutable_cpu_data();
  for (int index = 0; index < count; ++index) {
    const Dtype val = in[index];
    // Take the magnitude with a sign test rather than an unqualified abs():
    // depending on the headers in scope, abs() may bind to the C integer
    // overload and silently truncate a floating-point argument.
    const Dtype abs_val = (val < Dtype(0)) ? -val : val;
    if (abs_val < 1.0 / sigma2_) {
      out[index] = 0.5 * val * val * sigma2_;
    } else {
      out[index] = abs_val - 0.5 / sigma2_;
    }
  }

  if (has_weights_) {
    // errors_ := bottom[3] * f(diff_)
    caffe_mul(count, bottom[3]->cpu_data(), out, errors_.mutable_cpu_data());
  }

  // Sum the per-element errors via a dot product with ones_, then divide by
  // bottom[0]->num().
  Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
  top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
}
/**
 * CPU backward pass of the Smooth L1 loss layer.
 *
 * Reads diff_ as left by Forward_cpu and overwrites it in place with the
 * element-wise derivative
 *   f'(x) = sigma2_ * x   if |x| < 1 / sigma2_
 *           sign(x)       otherwise
 * Then, for each of bottom[0] and bottom[1] whose propagate_down flag is set,
 * writes  (+/-) top[0]->cpu_diff()[0] / bottom[i]->num() * f'(x)  into that
 * blob's diff (positive sign for bottom[0], negative for bottom[1]) and, when
 * has_weights_ is set, scales the diff by bottom[2] and then bottom[3].
 *
 * @param top             top[0]'s first diff element is the incoming loss
 *                        gradient.
 * @param propagate_down  per-bottom flags; only indices 0 and 1 are consulted.
 * @param bottom          bottom[0..1] receive gradients; bottom[2..3] are read
 *                        as weights when has_weights_.
 */
template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const int count = diff_.count();
  const Dtype* in = diff_.cpu_data();
  Dtype* out = diff_.mutable_cpu_data();
  for (int index = 0; index < count; ++index) {
    const Dtype val = in[index];
    // Sign test instead of an unqualified abs(), which may bind to the C
    // integer overload and truncate a floating-point argument.
    const Dtype abs_val = (val < Dtype(0)) ? -val : val;
    if (abs_val < 1.0 / sigma2_) {
      out[index] = sigma2_ * val;
    } else {
      // sign(val): +1, 0 or -1
      out[index] = (Dtype(0) < val) - (val < Dtype(0));
    }
  }

  for (int i = 0; i < 2; ++i) {
    if (!propagate_down[i]) {
      continue;
    }
    const Dtype sign = (i == 0) ? 1 : -1;
    const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
    // bottom[i].diff := alpha * f'(diff_)
    caffe_cpu_axpby(
        count,
        alpha,
        out,
        Dtype(0),
        bottom[i]->mutable_cpu_diff());
    if (has_weights_) {
      // The weighted gradient must be written back to the diff buffer.
      // Writing to mutable_cpu_data() here would overwrite the layer's
      // input values and leave the gradient unweighted.
      caffe_mul(
          count,
          bottom[2]->cpu_data(),
          bottom[i]->cpu_diff(),
          bottom[i]->mutable_cpu_diff());
      caffe_mul(
          count,
          bottom[3]->cpu_data(),
          bottom[i]->cpu_diff(),
          bottom[i]->mutable_cpu_diff());
    }
  }
}
roi_pooling_layer.cpp,修改如下:
/**
 * CPU forward pass of ROI max pooling.
 *
 * Each ROI in bottom[1] is read as five values
 * [batch_index, x1, y1, x2, y2]. The coordinates are multiplied by
 * spatial_scale_ and rounded, the resulting rectangle is divided into a
 * pooled_height_ x pooled_width_ grid of bins, and for every channel each bin
 * outputs the maximum of bottom[0] over that bin. The index of the winning
 * element within its channel plane (h * width_ + w) is stored in max_idx_;
 * bins that cover no pixels output 0 with index -1.
 *
 * @param bottom  bottom[0]: input feature maps; bottom[1]: ROIs.
 * @param top     top[0]: pooled output.
 */
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* feature_maps = bottom[0]->cpu_data();
  const Dtype* roi = bottom[1]->cpu_data();
  const int roi_count = bottom[1]->num();
  const int image_count = bottom[0]->num();
  const int output_count = top[0]->count();

  // Start every output at the lowest value / "no argmax" so the first
  // comparison in a non-empty bin always wins.
  Dtype* pooled = top[0]->mutable_cpu_data();
  caffe_set(output_count, Dtype(-FLT_MAX), pooled);
  int* argmax = max_idx_.mutable_cpu_data();
  caffe_set(output_count, -1, argmax);

  for (int r = 0; r < roi_count; ++r) {
    // ROI layout: [batch_index x1 y1 x2 y2]
    const int image_index = roi[0];
    const int left = round(roi[1] * spatial_scale_);
    const int top_edge = round(roi[2] * spatial_scale_);
    const int right = round(roi[3] * spatial_scale_);
    const int bottom_edge = round(roi[4] * spatial_scale_);
    CHECK_GE(image_index, 0);
    CHECK_LT(image_index, image_count);

    // Force ROIs to be at least 1x1.
    const int roi_h = max(bottom_edge - top_edge + 1, 1);
    const int roi_w = max(right - left + 1, 1);
    const Dtype bin_h =
        static_cast<Dtype>(roi_h) / static_cast<Dtype>(pooled_height_);
    const Dtype bin_w =
        static_cast<Dtype>(roi_w) / static_cast<Dtype>(pooled_width_);

    const Dtype* plane = feature_maps + bottom[0]->offset(image_index);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        // Row range of this bin: [floor(ph * bin_h), ceil((ph + 1) * bin_h)),
        // shifted to the ROI origin and clamped to the feature map.
        int row_begin =
            static_cast<int>(floor(static_cast<Dtype>(ph) * bin_h));
        int row_end =
            static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_h));
        row_begin = min(max(row_begin + top_edge, 0), height_);
        row_end = min(max(row_end + top_edge, 0), height_);

        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Column range of this bin, computed the same way.
          int col_begin =
              static_cast<int>(floor(static_cast<Dtype>(pw) * bin_w));
          int col_end =
              static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_w));
          col_begin = min(max(col_begin + left, 0), width_);
          col_end = min(max(col_end + left, 0), width_);

          const int bin = ph * pooled_width_ + pw;
          const bool has_pixels =
              (row_begin < row_end) && (col_begin < col_end);
          if (!has_pixels) {
            pooled[bin] = 0;
            argmax[bin] = -1;
            continue;
          }

          // Max over the bin, remembering where the max came from.
          for (int h = row_begin; h < row_end; ++h) {
            for (int w = col_begin; w < col_end; ++w) {
              const int pixel = h * width_ + w;
              if (plane[pixel] > pooled[bin]) {
                pooled[bin] = plane[pixel];
                argmax[bin] = pixel;
              }
            }
          }
        }
      }
      // Advance all three pointers to the next channel plane.
      plane += bottom[0]->offset(0, 1);
      pooled += top[0]->offset(0, 1);
      argmax += max_idx_.offset(0, 1);
    }
    // Advance to the next ROI record.
    roi += bottom[1]->offset(1);
  }
}
/**
 * CPU backward pass of ROI max pooling.
 *
 * Zeroes bottom[0]'s diff, then for every ROI in bottom[1] and every channel,
 * routes each pooled output's gradient (top[0] diff) back to the input
 * element recorded in max_idx_ as that output's argmax. Gradients from
 * different ROIs that selected the same input element accumulate.
 *
 * Does nothing when propagate_down[0] is false; no gradient is computed for
 * the ROI blob.
 *
 * @param top             top[0]: pooled output whose diff is propagated.
 * @param propagate_down  only index 0 is consulted.
 * @param bottom          bottom[0]: feature maps (diff is written);
 *                        bottom[1]: ROIs as [batch_index, x1, y1, x2, y2].
 */
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  // The outer loop walks ROIs, so it must be bounded by the ROI count
  // (bottom[1]), not by the number of images in the batch (bottom[0]).
  const int num_rois = bottom[1]->num();
  const int count = bottom[0]->count();
  const int batch_size = bottom[0]->num();
  caffe_set(count, Dtype(0), bottom_diff);
  const int* argmax_data = max_idx_.cpu_data();
  CHECK_EQ(top[0]->num(), bottom[1]->num()) << "top and bottom num not equal!";

  for (int n = 0; n < num_rois; ++n) {
    const int roi_batch_ind = bottom_rois[0];
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);
    const int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    const int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    const int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    const int roi_end_h = round(bottom_rois[4] * spatial_scale_);

    // Force ROIs to be at least 1x1, matching the forward pass.
    const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
        / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
        / static_cast<Dtype>(pooled_width_);

    // Only input positions inside the ROI rectangle (inclusive on both ends)
    // and inside the feature map can receive gradient from this ROI.
    const int h_first = max(roi_start_h, 0);
    const int h_last = min(roi_end_h, height_ - 1);
    const int w_first = max(roi_start_w, 0);
    const int w_last = min(roi_end_w, width_ - 1);

    Dtype* batch_bottom_diff = bottom_diff + bottom[0]->offset(roi_batch_ind);
    for (int c = 0; c < channels_; ++c) {
      for (int h = h_first; h <= h_last; ++h) {
        for (int w = w_first; w <= w_last; ++w) {
          // Index of (h, w) within the current channel plane; this is the
          // value the forward pass stores in max_idx_.
          const int index = h * width_ + w;

          // Range of pooled bins whose window may contain (h, w).
          int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
          int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
          int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
          int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);
          phstart = min(max(phstart, 0), pooled_height_);
          phend = min(max(phend, 0), pooled_height_);
          pwstart = min(max(pwstart, 0), pooled_width_);
          pwend = min(max(pwend, 0), pooled_width_);

          for (int ph = phstart; ph < phend; ++ph) {
            for (int pw = pwstart; pw < pwend; ++pw) {
              const int pool_index = ph * pooled_width_ + pw;
              // Only the element that won the max receives the gradient.
              if (argmax_data[pool_index] == index) {
                batch_bottom_diff[index] += top_diff[pool_index];
              }
            }
          }
        }
      }
      // Advance all three pointers to the next channel plane.
      batch_bottom_diff += bottom[0]->offset(0, 1);
      top_diff += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Advance to the next ROI record.
    bottom_rois += bottom[1]->offset(1);
  }
}