Ubuntu14.04下Py-faster-rcnn在CPU下的配置编译

最新推荐文章于 2020-08-12 10:52:32 发布

原创最新推荐文章于 2020-08-12 10:52:32 发布 · 416 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#ubuntu #py-faster-rcnn #CPU #安装 #caffe

深度学习专栏收录该内容

3 篇文章

订阅专栏

本文介绍如何在Ubuntu 14.04系统中为Py-Faster-RCNN配置仅CPU的支持，包括依赖项安装、源码编译、配置文件修改等步骤，并涉及如何针对CPU环境调整Caffe的编译选项。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Ubuntu14.04下Py-faster-rcnn在CPU下的配置编译

安装依赖项等
下载faster-rcnn
git clone --recursive https://github.com/rbgirshick/py-faster-rcnn.git
编译cython
打开py-faster-rcnn/lib/setup.py，修改成如下后make

 #CUDA = locate_cuda()  
 #self.set_executable('compiler_so', CUDA['nvcc'])  
    #Extension('nms.gpu_nms',  
        #['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'],  
        #library_dirs=[CUDA['lib64']],  
        #libraries=['cudart'],  
        #language='c++',  
        #runtime_library_dirs=[CUDA['lib64']],  
        # this syntax is specific to this build system  
        # we're only going to use certain compiler args with nvcc and not with  
        # gcc the implementation of this trick is in customize_compiler() below  
        #extra_compile_args={'gcc': ["-Wno-unused-function"],  
        #                    'nvcc': ['-arch=sm_35',  
        #                             '--ptxas-options=-v',  
        #                             '-c',  
        #                             '--compiler-options',  
        #                             "'-fPIC'"]},  
        #include_dirs = [numpy_include, CUDA['include']]  
    #),

4.
修改Makefile.config，将如下两行前的#去掉

    CPU_ONLY := 1
    WITH_PYTHON_LAYER := 1

修改CMakeLists.txt，如下：

caffe_option(CPU_ONLY  "Build Caffe without CUDA support" ON)

修改/py-faster-rcnn/lib/fast_rcnn/config.py，如下：

__C.USE_GPU_NMS = False

修改/py-faster-rcnn/lib/fast_rcnn/nms_wrapper.py，如下：

# from nms.gpu_nms import gpu_nms
def nms(dets, thresh, force_cpu=True):

make -j8 && make pycaffe

6.以上修改，可以进行测试。如果需要训练，还需修改py-faster-rcnn/caffe-fast-rcnn/src/caffe/layers/smooth_L1_loss_layer.cpp 和 roi_pooling_layer.cpp，修改后make -j8 && make pycaffe

其中，smooth_L1_loss_layer.cpp，修改

/**
 * CPU forward pass of the smooth-L1 loss.
 *
 * Steps, all element-wise over bottom[0]->count() values:
 *   1. diff_   = bottom[0] - bottom[1]
 *   2. diff_  *= bottom[2]                      (only when has_weights_)
 *   3. errors_ = f(diff_), where
 *        f(x) = 0.5 * sigma2_ * x^2    if |x| < 1 / sigma2_
 *               |x| - 0.5 / sigma2_    otherwise
 *   4. errors_ *= bottom[3]                     (only when has_weights_)
 *   5. top[0][0] = dot(ones_, errors_) / bottom[0]->num()
 *
 * diff_ is left populated because Backward_cpu reads it.
 *
 * @param bottom  [0] and [1] are the two inputs being compared; [2] and [3]
 *                are per-element weights, read only when has_weights_ is set.
 * @param top     top[0] receives the scalar loss in its first element.
 */
template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
      << "Inputs must have the same dimension.";
  const int count = bottom[0]->count();

  // diff_ = bottom[0] - bottom[1]
  caffe_sub(count,
            bottom[0]->cpu_data(),
            bottom[1]->cpu_data(),
            diff_.mutable_cpu_data());

  if (has_weights_) {
    // Weight the raw difference before it goes through the loss function.
    caffe_mul(count,
              bottom[2]->cpu_data(),
              diff_.cpu_data(),
              diff_.mutable_cpu_data());
  }

  const Dtype* in = diff_.cpu_data();
  Dtype* out = errors_.mutable_cpu_data();
  for (int index = 0; index < count; ++index) {
    const Dtype val = in[index];
    // Do not call unqualified abs() here: depending on which headers and
    // using-declarations are in effect it may resolve to the C `int abs(int)`
    // overload, which truncates the fractional part and selects the wrong
    // branch below. Computing the magnitude by hand is overload-independent.
    const Dtype abs_val = (val < Dtype(0)) ? -val : val;
    if (abs_val < 1.0 / sigma2_) {
      out[index] = 0.5 * val * val * sigma2_;
    } else {
      out[index] = abs_val - 0.5 / sigma2_;
    }
  }

  if (has_weights_) {
    // Weight the per-element loss (in place; `out` aliases errors_).
    caffe_mul(count, bottom[3]->cpu_data(), out, errors_.mutable_cpu_data());
  }

  // Reduce to a scalar. ones_ is presumably an all-ones blob of matching
  // size set up elsewhere in the layer -- TODO confirm in Reshape().
  const Dtype loss =
      caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
  top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
}

/**
 * CPU backward pass of the smooth-L1 loss.
 *
 * Expects diff_ to hold the (optionally weighted) difference computed by
 * Forward_cpu. It is overwritten in place with the derivative
 *   f'(x) = sigma2_ * x    if |x| < 1 / sigma2_
 *           sign(x)        otherwise
 * For each of bottom[0] and bottom[1] with propagate_down set, the gradient
 * is then
 *   bottom[i].diff = (+1 or -1) * top[0].diff[0] / bottom[i]->num() * f'(x)
 * and, when has_weights_ is set, multiplied element-wise by bottom[2] and
 * bottom[3].
 *
 * @param top             top[0]->cpu_diff()[0] supplies the upstream scale.
 * @param propagate_down  entries 0 and 1 select which inputs get a gradient.
 * @param bottom          inputs; only their diffs are written.
 */
template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const int count = diff_.count();

  // `in` and `out` alias the same buffer: the derivative replaces diff_.
  const Dtype* in = diff_.cpu_data();
  Dtype* out = diff_.mutable_cpu_data();
  for (int index = 0; index < count; ++index) {
    const Dtype val = in[index];
    // Hand-rolled magnitude: unqualified abs() may bind to `int abs(int)`
    // and truncate, which would select the wrong branch.
    const Dtype abs_val = (val < Dtype(0)) ? -val : val;
    if (abs_val < 1.0 / sigma2_) {
      out[index] = sigma2_ * val;
    } else {
      // sign(val) as -1, 0 or +1
      out[index] = (Dtype(0) < val) - (val < Dtype(0));
    }
  }

  for (int i = 0; i < 2; ++i) {
    if (!propagate_down[i]) {
      continue;
    }
    // d(b0 - b1)/d(b0) = +1, d(b0 - b1)/d(b1) = -1
    const Dtype sign = (i == 0) ? 1 : -1;
    const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
    caffe_cpu_axpby(
        count,
        alpha,
        out,
        Dtype(0),
        bottom[i]->mutable_cpu_diff());

    if (has_weights_) {
      // The weight scaling must land in the *diff* buffer. Writing it to
      // mutable_cpu_data() would clobber the layer's inputs and leave the
      // gradient unweighted.
      caffe_mul(
          count,
          bottom[2]->cpu_data(),
          bottom[i]->cpu_diff(),
          bottom[i]->mutable_cpu_diff());
      caffe_mul(
          count,
          bottom[3]->cpu_data(),
          bottom[i]->cpu_diff(),
          bottom[i]->mutable_cpu_diff());
    }
  }
}

roi_pooling_layer.cpp，修改如下：

/**
 * CPU forward pass of ROI max pooling.
 *
 * bottom[0] is the feature blob and bottom[1] holds one ROI per row as
 * [batch_index x1 y1 x2 y2]. Each ROI is scaled by spatial_scale_, split
 * into a pooled_height_ x pooled_width_ grid of bins, and every bin is
 * max-pooled independently for each channel. top[0] receives the maxima;
 * max_idx_ receives the within-channel offset (h * width_ + w) of the
 * winning element, or -1 for a bin that covers no pixels.
 */
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* features = bottom[0]->cpu_data();
  const Dtype* roi = bottom[1]->cpu_data();
  const int roi_total = bottom[1]->num();
  const int batch_size = bottom[0]->num();
  const int out_total = top[0]->count();

  // Seed outputs so that any real value beats the initial maximum.
  Dtype* pooled = top[0]->mutable_cpu_data();
  caffe_set(out_total, Dtype(-FLT_MAX), pooled);
  int* winner = max_idx_.mutable_cpu_data();
  caffe_set(out_total, -1, winner);

  for (int n = 0; n < roi_total; ++n) {
    // ROI layout: [batch_index x1 y1 x2 y2]
    const int roi_batch_ind = roi[0];
    const int roi_start_w = round(roi[1] * spatial_scale_);
    const int roi_start_h = round(roi[2] * spatial_scale_);
    const int roi_end_w = round(roi[3] * spatial_scale_);
    const int roi_end_h = round(roi[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    // Degenerate ROIs are forced to be at least 1x1.
    const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);

    // Channel 0 of the image this ROI refers to.
    const Dtype* plane = features + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Bin extent relative to the ROI origin:
          //   start (inclusive) = floor(p * bin_size)
          //   end   (exclusive) = ceil((p + 1) * bin_size)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));

          // Shift into feature-map coordinates and clip to its bounds.
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);

          const int pool_index = ph * pooled_width_ + pw;
          if ((hend <= hstart) || (wend <= wstart)) {
            // Bin lies entirely outside the feature map.
            pooled[pool_index] = 0;
            winner[pool_index] = -1;
          } else {
            for (int h = hstart; h < hend; ++h) {
              const int row_base = h * width_;
              for (int w = wstart; w < wend; ++w) {
                const int index = row_base + w;
                if (plane[index] > pooled[pool_index]) {
                  pooled[pool_index] = plane[index];
                  winner[pool_index] = index;
                }
              }
            }
          }
        }
      }
      // Step every cursor forward by one channel.
      plane += bottom[0]->offset(0, 1);
      pooled += top[0]->offset(0, 1);
      winner += max_idx_.offset(0, 1);
    }
    // Move on to the next ROI row.
    roi += bottom[1]->offset(1);
  }
}

/**
 * CPU backward pass of ROI max pooling.
 *
 * Zeroes bottom[0]'s diff, then for every ROI in bottom[1] (rows of
 * [batch_index x1 y1 x2 y2]) and every channel, adds top_diff of each pooled
 * bin to the feature-map position recorded for that bin in max_idx_ by
 * Forward_cpu. Positions selected by several bins or ROIs accumulate all of
 * their gradients. No gradient is computed for the ROI blob itself.
 *
 * @param top             top[0]->cpu_diff() holds the upstream gradient.
 * @param propagate_down  nothing is done unless propagate_down[0] is set.
 * @param bottom          bottom[0]'s diff is written; bottom[1] is read.
 */
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  // The loop must run over ROIs (rows of bottom[1]), not over the images in
  // bottom[0]; using the image count would drop the gradient of every ROI
  // past the first batch_size rows.
  const int num_rois = bottom[1]->num();
  const int count = bottom[0]->count();
  const int batch_size = bottom[0]->num();
  caffe_set(count, Dtype(0), bottom_diff);
  const int* argmax_data = max_idx_.cpu_data();

  CHECK_EQ(top[0]->num(),bottom[1]->num())<<"top and bottom num not equal!";

  for (int n = 0; n < num_rois; ++n) {
    const int roi_batch_ind = bottom_rois[0];
    CHECK_GE(roi_batch_ind,0);
    CHECK_LT(roi_batch_ind, batch_size);

    const int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    const int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    const int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    const int roi_end_h = round(bottom_rois[4] * spatial_scale_);

    // Same geometry as the forward pass: ROIs are at least 1x1.
    const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    const int roi_width = max(roi_end_w - roi_start_w + 1, 1);

    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);

    // Channel 0 of the image this ROI was pooled from.
    Dtype* batch_bottom_diff = bottom_diff + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      for (int h = 0; h < height_; ++h) {
        for (int w = 0; w < width_; ++w) {
          // Skip positions the ROI does not cover.
          const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                               h >= roi_start_h && h <= roi_end_h);
          if (!in_roi) {
            continue;
          }

          // Within-channel offset; the same encoding Forward_cpu stores in
          // max_idx_.
          const int index = h * width_ + w;

          // Range of pooled bins whose window can contain (h, w).
          int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
          int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
          int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
          int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

          phstart = min(max(phstart, 0), pooled_height_);
          phend = min(max(phend, 0), pooled_height_);
          pwstart = min(max(pwstart, 0), pooled_width_);
          pwend = min(max(pwend, 0), pooled_width_);

          for (int ph = phstart; ph < phend; ++ph) {
            for (int pw = pwstart; pw < pwend; ++pw) {
              const int pool_index = ph * pooled_width_ + pw;
              // Only the bin's arg-max position receives its gradient.
              if (argmax_data[pool_index] == index) {
                batch_bottom_diff[index] += top_diff[pool_index];
              }
            }
          }
        }
      }
      // Advance all cursors by one channel.
      batch_bottom_diff += bottom[0]->offset(0, 1);
      top_diff += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Next ROI row.
    bottom_rois += bottom[1]->offset(1);
  }
}