支持残差网络和老版分割网络的caffe

本文详细介绍如何在Caffe中自定义BN层和Upsample层,包括修改源代码、添加头文件及配置编译参数等步骤,并提供了解决过程中可能遇到问题的链接。
部署运行你感兴趣的模型镜像

系统环境Ubuntu14.04以上,CUDA8.0以上,python2

首先要拼出一个满足上述需求的caffe,使其支持resnet和segnet等网络(其他网络同样可以用类似的方法支持)。老版caffe的layer是通过common_layers和layer_factory来实现各个xxx类的;新版的layer生成简单一点,需要加什么layer,就在include/caffe/layers和src/caffe/layers下面分别添加相应的头文件和源文件即可。前提是系统可以正常编译各种版本的caffe。

第一步,在src/caffe下添加data_reader.cpp文件,在include/caffe下添加data_reader.hpp文件

data_reader.cpp实现源码:

#include <boost/thread.hpp>
#include <map>
#include <string>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/data_reader.hpp"
#include "caffe/layers/data_layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

using boost::weak_ptr;

// Registry of reader bodies, keyed by the string returned from
// DataReader::source_key(). Entries are weak pointers, so the map itself
// never keeps a Body alive.
map<const string, weak_ptr<DataReader::Body> > DataReader::bodies_;
// Guards bodies_; taken by the DataReader constructor and destructor.
static boost::mutex bodies_mutex_;

/**
 * Builds this reader's queue pair (prefetch * batch_size datums) and attaches
 * it to the Body registered under the layer's source key, creating the Body
 * when no live one exists.
 */
DataReader::DataReader(const LayerParameter& param)
    : queue_pair_(new QueuePair(
          param.data_param().prefetch() * param.data_param().batch_size())) {
  // The registry is shared between readers: keep it locked for the whole
  // lookup-or-create sequence.
  boost::mutex::scoped_lock registry_guard(bodies_mutex_);
  const string registry_key = source_key(param);
  weak_ptr<Body>& slot = bodies_[registry_key];
  body_ = slot.lock();
  if (!body_) {
    // No live body for this key: start one and remember it in the slot.
    body_.reset(new Body(param));
    slot = body_;
  }
  // Publish the new queue pair so the body's thread can pick it up.
  body_->new_queue_pairs_.push(queue_pair_);
}

/**
 * Drops this reader's reference to its Body and removes the registry entry
 * once no reader references that Body any more.
 */
DataReader::~DataReader() {
  // Compute the key before releasing body_, since it is derived from the
  // body's stored parameters.
  const string registry_key = source_key(body_->param_);
  body_.reset();
  boost::mutex::scoped_lock registry_guard(bodies_mutex_);
  map<const string, weak_ptr<Body> >::iterator entry =
      bodies_.find(registry_key);
  if (entry != bodies_.end() && entry->second.expired()) {
    bodies_.erase(entry);
  }
}

//

/**
 * Pre-fills the free queue with `size` newly allocated, empty datums.
 * A non-positive size leaves the queue empty.
 */
DataReader::QueuePair::QueuePair(int size) {
  int remaining = size;
  while (remaining > 0) {
    free_.push(new Datum());
    --remaining;
  }
}

/**
 * Deletes every datum still held by either queue: free_ is drained first,
 * then full_.
 */
DataReader::QueuePair::~QueuePair() {
  for (Datum* leftover = NULL; free_.try_pop(&leftover); ) {
    delete leftover;
  }
  for (Datum* leftover = NULL; full_.try_pop(&leftover); ) {
    delete leftover;
  }
}

//

/**
 * Stores a copy of the layer parameters and starts the internal thread.
 * new_queue_pairs_ is default-constructed (empty).
 */
DataReader::Body::Body(const LayerParameter& param)
    : param_(param) {
  this->StartInternalThread();
}

/** Stops the internal thread before the body's members are destroyed. */
DataReader::Body::~Body() {
  this->StopInternalThread();
}

void DataReader::Body::InternalThreadEntry() {
  shared_ptr<db::DB> db(db::GetDB(param_.data_param().backend()));
  db->Open(param_.data_param().source(), db::READ);
  shared_ptr<db::Cursor> cursor(db->NewCursor());
  vector<shared_ptr<QueuePair> > qps;
  try {
    int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1;

    // To ensure deterministic runs, only start running once all solvers
    // are ready. But solvers need to peek on one item during initialization,
    // so read one item, then wait for the next solver.
    for (int i = 0; i < solver_count; ++i) {
      shared_ptr<QueuePair> qp(new_queue_pairs_.pop());
      read_one(cursor.get(), qp.get());
      qps.push_back(qp);
    }
    // Main loop
    while (!must_stop()) {
      for (int i = 0; i < solver_count; ++i) {
        read_one(cursor.get(), qps[i].get());
      }
      // Check no additional readers have been created. This can happen if
      // more than one net is trained at a time per process, whether single
      // or multi solver. It might also happen if two data layers have same
      // name and same source.
      CHECK_EQ(new_queue_pairs_.size(), 0);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
}

/**
 * Moves one record from the cursor into the queue pair: takes a datum from
 * qp->free_, parses the cursor's current value into it, pushes it onto
 * qp->full_, then advances the cursor and wraps to the first record when the
 * end is reached.
 */
void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) {
  Datum* slot = qp->free_.pop();
  // TODO deserialize in-place instead of copy?
  slot->ParseFromString(cursor->value());
  qp->full_.push(slot);

  // Step to the next record; nothing more to do while the cursor is valid.
  cursor->Next();
  if (cursor->valid()) {
    return;
  }
  DLOG(INFO) << "Restarting data prefetching from start.";
  cursor->SeekToFirst();
}
}  // namespace caffe

data_reader.hpp实现源码:

#ifndef CAFFE_DATA_READER_HPP_
#define CAFFE_DATA_READER_HPP_

#include <map>
#include <string>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/internal_thread.hpp"
#include "caffe/util/blocking_queue.hpp"
#include "caffe/util/db.hpp"

namespace caffe {

/**
 * @brief Reads data from a source to queues available to data layers.
 * A single reading thread is created per source, even if multiple solvers
 * are running in parallel, e.g. for multi-GPU training. This makes sure
 * databases are read sequentially, and that each solver accesses a different
 * subset of the database. Data is distributed to solvers in a round-robin
 * way to keep parallel training deterministic.
 */
class DataReader {
 public:
  // Creates the reader's queue pair and attaches it to the Body registered
  // for param's source key (a new Body is created when none is alive).
  explicit DataReader(const LayerParameter& param);
  // Releases the Body reference and removes the registry entry once it has
  // expired.
  ~DataReader();

  // Queue of datums that are available to be filled by the reading thread.
  inline BlockingQueue<Datum*>& free() const {
    return queue_pair_->free_;
  }
  // Queue of datums that have been filled from the database.
  inline BlockingQueue<Datum*>& full() const {
    return queue_pair_->full_;
  }

 protected:
  // Queue pairs are shared between a body and its readers
  class QueuePair {
   public:
    // Pre-fills free_ with `size` newly allocated datums.
    explicit QueuePair(int size);
    // Deletes every datum still held in free_ and full_.
    ~QueuePair();

    BlockingQueue<Datum*> free_;  // datums ready to be (re)filled
    BlockingQueue<Datum*> full_;  // datums holding parsed records

  DISABLE_COPY_AND_ASSIGN(QueuePair);
  };

  // A single body is created per source
  class Body : public InternalThread {
   public:
    // Copies the layer parameters and starts the internal thread.
    explicit Body(const LayerParameter& param);
    // Stops the internal thread.
    virtual ~Body();

   protected:
    // Thread entry point: reads the database and feeds the queue pairs.
    void InternalThreadEntry();
    // Reads the cursor's current record into one datum of qp and advances
    // the cursor, wrapping around at the end of the database.
    void read_one(db::Cursor* cursor, QueuePair* qp);

    // Parameters of the layer that created this body.
    const LayerParameter param_;
    // Queue pairs pushed by newly constructed readers and popped by the
    // reading thread.
    BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;

    friend class DataReader;

  DISABLE_COPY_AND_ASSIGN(Body);
  };

  // A source is uniquely identified by its layer name + path, in case
  // the same database is read from two different locations in the net.
  static inline string source_key(const LayerParameter& param) {
    return param.name() + ":" + param.data_param().source();
  }

  // This reader's own queue pair. Declared before body_, so it is
  // initialized first.
  const shared_ptr<QueuePair> queue_pair_;
  // The body shared by readers that use the same source key.
  shared_ptr<Body> body_;

  // Registry of bodies keyed by source_key(); holds weak references only.
  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;

DISABLE_COPY_AND_ASSIGN(DataReader);
};

}  // namespace caffe
#endif  // CAFFE_DATA_READER_HPP_

然后修改util下面的blocking_queue.cpp的实现源码:

#include <boost/thread.hpp>
#include <string>
#include "caffe/data_reader.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/layers/base_data_layer.hpp"
#include "caffe/parallel.hpp"
#include "caffe/util/blocking_queue.hpp"

namespace caffe {

// Synchronization state of a BlockingQueue, defined here in the source file.
template<typename T>
class BlockingQueue<T>::sync {
 public:
  // Protects queue_; mutable so that const members such as size() can lock.
  mutable boost::mutex mutex_;
  // Notified by push(); waited on by pop() and peek() while the queue is
  // empty.
  boost::condition_variable condition_;
};

// Creates an empty queue together with its own mutex/condition pair.
template<typename T>
BlockingQueue<T>::BlockingQueue()
    : sync_(new sync()) {
}

// Appends t to the queue and wakes one thread waiting in pop() or peek().
template<typename T>
void BlockingQueue<T>::push(const T& t) {
  {
    // Hold the mutex only while the queue is modified; it is released at the
    // end of this scope, before the notification is sent.
    boost::mutex::scoped_lock guard(sync_->mutex_);
    queue_.push(t);
  }
  sync_->condition_.notify_one();
}

// Non-blocking pop: when the queue holds an element, stores the front one in
// *t, removes it and returns true; otherwise returns false and leaves *t
// untouched.
template<typename T>
bool BlockingQueue<T>::try_pop(T* t) {
  boost::mutex::scoped_lock guard(sync_->mutex_);

  const bool has_item = !queue_.empty();
  if (has_item) {
    *t = queue_.front();
    queue_.pop();
  }
  return has_item;
}

// Blocking pop: waits until the queue is non-empty, then removes and returns
// the front element. When log_on_wait is non-empty it is logged (rate limited
// by LOG_EVERY_N) each time the loop finds the queue empty.
template<typename T>
T BlockingQueue<T>::pop(const string& log_on_wait) {
  boost::mutex::scoped_lock guard(sync_->mutex_);

  const bool announce_wait = !log_on_wait.empty();
  while (queue_.empty()) {
    if (announce_wait) {
      LOG_EVERY_N(INFO, 1000)<< log_on_wait;
    }
    // Releases the mutex while blocked and re-acquires it on wake-up.
    sync_->condition_.wait(guard);
  }

  T head = queue_.front();
  queue_.pop();
  return head;
}

// Non-blocking peek: copies the front element into *t without removing it.
// Returns false (and leaves *t untouched) when the queue is empty.
template<typename T>
bool BlockingQueue<T>::try_peek(T* t) {
  boost::mutex::scoped_lock guard(sync_->mutex_);

  const bool has_item = !queue_.empty();
  if (has_item) {
    *t = queue_.front();
  }
  return has_item;
}

// Blocking peek: waits until the queue is non-empty and returns a copy of the
// front element, which stays in the queue.
template<typename T>
T BlockingQueue<T>::peek() {
  boost::mutex::scoped_lock guard(sync_->mutex_);

  for (; queue_.empty(); ) {
    sync_->condition_.wait(guard);
  }

  T head = queue_.front();
  return head;
}

// Returns the number of queued elements, read while holding the mutex.
template<typename T>
size_t BlockingQueue<T>::size() const {
  boost::mutex::scoped_lock guard(sync_->mutex_);
  const size_t count = queue_.size();
  return count;
}

// Explicit instantiations: the member definitions above live in this source
// file, so every element type used with BlockingQueue is instantiated here.
template class BlockingQueue<Batch<float>*>;
template class BlockingQueue<Batch<double>*>;
template class BlockingQueue<Datum*>;
// Element type of DataReader::Body::new_queue_pairs_.
template class BlockingQueue<shared_ptr<DataReader::QueuePair> >;
}  // namespace caffe

然后就是src/caffe/layers下面添加bn_layer.cpp、bn_layer.cu、upsample_layer.cpp、upsample_layer.cu文件

bn_layer.cpp实现源码:

#include <algorithm>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/layers/bn_layer.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {
  /**
   * @brief Sizes the top blobs and all internal scratch buffers for the
   *        current shape of bottom[0].
   *
   * The cached dimensions N_, H_ and W_ are refreshed from bottom[0] on every
   * call, so the scratch buffers always agree with top[0] and x_norm_ (which
   * are sized from bottom[0]). Without this, a bottom whose batch size or
   * spatial size changed after LayerSetUp would leave the buffers at their
   * old size while Forward/Backward use the cached dimensions in BLAS calls.
   * The channel count must not change, because the learned scale/shift blobs
   * are allocated with C_ channels in LayerSetUp.
   *
   * @param bottom bottom[0] is the input blob.
   * @param top top[0] gets the shape of bottom[0]; optional top[1] and top[2]
   *        (batch mean / batch variance) get shape (1, C_, 1, 1).
   */
  template <typename Dtype>
  void BNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    CHECK_EQ(bottom[0]->channels(), C_)
        << "BN layer: the number of input channels must not change "
        << "after setup.";
    // Keep the cached dimensions in sync with the actual input.
    N_ = bottom[0]->num();
    H_ = bottom[0]->height();
    W_ = bottom[0]->width();

    top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
        bottom[0]->height(), bottom[0]->width());
    if (top.size() > 1) {
        // top blob for batch mean
        top[1]->Reshape(1, C_, 1, 1);
    }
    if (top.size() > 2) {
        // top blob for batch variance
        top[2]->Reshape(1, C_, 1, 1);
    }

    // normalized activations, saved by the forward pass
    x_norm_.Reshape(bottom[0]->num(), bottom[0]->channels(),
        bottom[0]->height(), bottom[0]->width());

    // mean
    spatial_mean_.Reshape(N_, C_, 1, 1);
    batch_mean_.Reshape(1, C_, 1, 1);
    // variance
    spatial_variance_.Reshape(N_, C_, 1, 1);
    batch_variance_.Reshape(1, C_, 1, 1);
    // buffer blob
    buffer_blob_.Reshape(N_, C_, H_, W_);

    // fill spatial multiplier: data is all ones, diff is all zeros
    spatial_sum_multiplier_.Reshape(1, 1, H_, W_);
    Dtype* spatial_multipl_data = spatial_sum_multiplier_.mutable_cpu_data();
    caffe_set(spatial_sum_multiplier_.count(), Dtype(1),
        spatial_multipl_data);
    caffe_set(spatial_sum_multiplier_.count(), Dtype(0),
        spatial_sum_multiplier_.mutable_cpu_diff());
    // fill batch multiplier: data is all ones, diff is all zeros
    batch_sum_multiplier_.Reshape(N_, 1, 1, 1);
    Dtype* batch_multiplier_data = batch_sum_multiplier_.mutable_cpu_data();
    caffe_set(batch_sum_multiplier_.count(), Dtype(1),
        batch_multiplier_data);
    caffe_set(batch_sum_multiplier_.count(), Dtype(0),
        batch_sum_multiplier_.mutable_cpu_diff());
  }
  /**
   * @brief One-time setup: caches the input dimensions, sets the variance
   *        epsilon and creates the learnable scale/shift blobs unless the
   *        layer already has blobs.
   *
   * @param bottom bottom[0] provides the dimensions N_, C_, H_ and W_.
   * @param top not used here.
   */
  template <typename Dtype>
  void BNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    // Cache the input dimensions and the epsilon added to the variance.
    const Blob<Dtype>& input = *bottom[0];
    N_ = input.num();
    C_ = input.channels();
    H_ = input.height();
    W_ = input.width();
    var_eps_ = 1e-9;

    if (this->blobs_.empty()) {
      // blobs_[0] is the per-channel scale, blobs_[1] the per-channel shift;
      // each is filled by the filler configured in bn_param.
      this->blobs_.resize(2);

      this->blobs_[0].reset(new Blob<Dtype>(1, C_, 1, 1));
      shared_ptr<Filler<Dtype> > gamma_filler(GetFiller<Dtype>(
          this->layer_param_.bn_param().scale_filler()));
      gamma_filler->Fill(this->blobs_[0].get());

      this->blobs_[1].reset(new Blob<Dtype>(1, C_, 1, 1));
      shared_ptr<Filler<Dtype> > beta_filler(GetFiller<Dtype>(
          this->layer_param_.bn_param().shift_filler()));
      beta_filler->Fill(this->blobs_[1].get());
    } else {
      // The layer already has blobs, so leave them as they are.
      LOG(INFO) << "Skipping parameter initialization";
    }
    this->param_propagate_down_.resize(this->blobs_.size(), true);
  }

  /**
   * @brief CPU forward pass of the BN layer.
   *
   * In BNParameter_BNMode_LEARN mode the per-channel mean and variance are
   * computed over the batch and spatial dimensions of bottom[0], the input is
   * normalized with them, and the per-channel scale (blobs_[0]) and shift
   * (blobs_[1]) are applied. The normalized activations are saved in x_norm_.
   * When present, top[1] and top[2] receive the batch mean and the batch
   * variance (the variance is copied before var_eps_ is added).
   *
   * In BNParameter_BNMode_INFERENCE mode no statistics are computed: the
   * output is bottom * scale + shift, broadcast per channel.
   *
   * Per-channel vectors are broadcast to the full blob in two gemm steps:
   * first across the batch (via batch_sum_multiplier_), then across the
   * spatial positions (via spatial_sum_multiplier_).
   *
   * @param bottom bottom[0] is the input blob.
   * @param top top[0] is the output blob; top[1] / top[2] are optional.
   */
  template <typename Dtype>
  void BNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    const Dtype* bottom_data = bottom[0]->cpu_data();
    Dtype* top_data = top[0]->mutable_cpu_data();
    const Dtype* const_top_data = top[0]->cpu_data();

    const Dtype* scale_data = this->blobs_[0]->cpu_data();
    const Dtype* shift_data = this->blobs_[1]->cpu_data();

    switch (this->layer_param_.bn_param().bn_mode()) {
    case BNParameter_BNMode_LEARN:
      // put the squares of bottom into buffer_blob_
      caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
          buffer_blob_.mutable_cpu_data());

      // computes variance using var(X) = E(X^2) - (EX)^2
      // NOTE(review): in floating point this difference can presumably come
      // out slightly negative; with a small var_eps_ the square root below
      // would then produce NaN - confirm whether a clamp is needed.
      // EX across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1. / (H_ * W_)), bottom_data,
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());
      // EX across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
          spatial_mean_.cpu_data(),
          batch_sum_multiplier_.cpu_data(), Dtype(0),
          batch_mean_.mutable_cpu_data());

      // E(X^2) across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1. / (H_ * W_)), buffer_blob_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_variance_.mutable_cpu_data());
      // E(X^2) across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
          spatial_variance_.cpu_data(),
          batch_sum_multiplier_.cpu_data(), Dtype(0),
          batch_variance_.mutable_cpu_data());

      // Only the first batch_mean_.count() entries of buffer_blob_ are used
      // as scratch space here.
      caffe_powx(batch_mean_.count(), batch_mean_.cpu_data(), Dtype(2),
          buffer_blob_.mutable_cpu_data());  // (EX)^2
      caffe_sub(batch_mean_.count(), batch_variance_.cpu_data(),
          buffer_blob_.cpu_data(),
          batch_variance_.mutable_cpu_data());  // variance

      // save top[1] (batch_mean) and top[2] (batch_variance)
      if (top.size() > 1) {
          caffe_copy(batch_mean_.count(), batch_mean_.cpu_data(),
              top[1]->mutable_cpu_data());
      }
      if (top.size() > 2) {
          caffe_copy(batch_variance_.count(), batch_variance_.cpu_data(),
              top[2]->mutable_cpu_data());
      }

      // do mean and variance normalization
      // subtract mean
      // (broadcast batch_mean_ across the batch into spatial_mean_ ...)
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
          C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(),
          batch_mean_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());

      // (... then across spatial positions, negated, into buffer_blob_)
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(-1),
          spatial_mean_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());

      // top = bottom - mean
      caffe_add(buffer_blob_.count(), bottom_data,
          buffer_blob_.cpu_data(), top_data);

      // normalize variance
      // After these two calls batch_variance_ holds sqrt(variance + var_eps_).
      caffe_add_scalar(batch_variance_.count(), var_eps_,
        batch_variance_.mutable_cpu_data());
      caffe_powx(batch_variance_.count(),
          batch_variance_.cpu_data(), Dtype(0.5),
          batch_variance_.mutable_cpu_data());

      // Broadcast sqrt(variance + var_eps_) to the full blob shape.
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
          C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(),
          batch_variance_.cpu_data(), Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_ * C_, H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());

      // top = (bottom - mean) / sqrt(variance + var_eps_)
      caffe_div(buffer_blob_.count(), const_top_data,
          buffer_blob_.cpu_data(), top_data);

      // Saving x_norm
      caffe_copy(buffer_blob_.count(), const_top_data,
          x_norm_.mutable_cpu_data());
      // scale
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_mul(buffer_blob_.count(), top_data,
          buffer_blob_.cpu_data(), top_data);

      // shift
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0),
          spatial_mean_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_ * C_, H_ * W_, 1, Dtype(1),
          spatial_mean_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_add(buffer_blob_.count(), const_top_data,
          buffer_blob_.cpu_data(), top_data);
      break;
    case BNParameter_BNMode_INFERENCE:
      // scale
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_mul(buffer_blob_.count(), bottom_data,
          buffer_blob_.cpu_data(), top_data);

      // shift
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0),
          spatial_mean_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_ * C_, H_ * W_, 1, Dtype(1),
          spatial_mean_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_add(buffer_blob_.count(), const_top_data,
          buffer_blob_.cpu_data(), top_data);
      break;
    default:
      LOG(FATAL) << "Unknown BN mode.";
    } 
  }

  /**
   * @brief CPU backward pass of the BN layer.
   *
   * Both modes first compute the parameter gradients:
   *   scale_diff = sum over batch and space of (x * top_diff), where x is
   *                x_norm_ in LEARN mode and bottom_data in INFERENCE mode;
   *   shift_diff = sum over batch and space of top_diff.
   * Both are written with beta = 0, i.e. they overwrite the parameter diffs.
   * NOTE(review): propagate_down and param_propagate_down_ are not consulted
   * here, and the parameter diffs are not accumulated - confirm this matches
   * what the solver in use expects.
   *
   * LEARN mode then computes, with dy = scale * top_diff and
   * M = N_ * H_ * W_:
   *   bottom_diff = (dy - (sum(dy) + x_norm * sum(dy * x_norm)) / M)
   *                 / sqrt(variance + var_eps_)
   * where the sums run over batch and space per channel, and batch_variance_
   * still holds sqrt(variance + var_eps_) from the forward pass.
   *
   * INFERENCE mode computes bottom_diff = scale * top_diff.
   *
   * Compared to the previous version, a caffe_powx call that squared
   * bottom_data into buffer_blob_ has been removed from the LEARN branch: its
   * result was fully overwritten by the following beta = 0 gemm before it was
   * ever read.
   *
   * @param top top[0] provides the incoming gradient (diff).
   * @param propagate_down not used.
   * @param bottom bottom[0] receives the gradient in its diff.
   */
  template <typename Dtype>
  void BNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) {
    const Dtype* top_diff = top[0]->cpu_diff();
    const Dtype* bottom_data = bottom[0]->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();

    Dtype* scale_diff = this->blobs_[0]->mutable_cpu_diff();
    Dtype* shift_diff = this->blobs_[1]->mutable_cpu_diff();
    const Dtype* scale_data = this->blobs_[0]->cpu_data();

    switch (this->layer_param_.bn_param().bn_mode()) {
    case BNParameter_BNMode_LEARN:
      // Propagate layer to parameters
      // gradient w.r.t. scale
      caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(),
          top_diff, buffer_blob_.mutable_cpu_data());
      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
          H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_variance_.mutable_cpu_diff());
      // sum across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_variance_.cpu_diff(),
          batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff);

      // gradient w.r.t. shift
      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
          H_ * W_, Dtype(1), top_diff,
          spatial_sum_multiplier_.cpu_data(),
          Dtype(0), spatial_mean_.mutable_cpu_diff());
      // sum across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_,
          Dtype(1), spatial_mean_.cpu_diff(),
          batch_sum_multiplier_.cpu_data(),
          Dtype(0), shift_diff);

      // Propagate down

      // put scale * top_diff to buffer_blob_
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(),
          buffer_blob_.mutable_cpu_data());

      // use new top diff for computation
      // bottom_diff = x_norm * dy (used as scratch for the reduction below)
      caffe_mul(buffer_blob_.count(),  x_norm_.cpu_data(),
          buffer_blob_.cpu_data(), bottom_diff);
      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1), bottom_diff,
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());
      // sum across batch: batch_mean_ = sum(dy * x_norm) per channel
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.cpu_data(),
          batch_sum_multiplier_.cpu_data(), Dtype(0),
          batch_mean_.mutable_cpu_data());

      // broadcast sum(dy * x_norm) back to the full blob shape
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(),
          batch_mean_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_mean_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          bottom_diff);

      // bottom_diff = x_norm * sum(dy * x_norm)
      caffe_mul(buffer_blob_.count(),
          x_norm_.cpu_data(), bottom_diff, bottom_diff);

      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
          H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());
      // sum across batch: batch_mean_ = sum(dy) per channel
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.cpu_data(),
          batch_sum_multiplier_.cpu_data(), Dtype(0),
          batch_mean_.mutable_cpu_data());

      // broadcast sum(dy) and add it onto bottom_diff (beta = 1 in the
      // second gemm)
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(),
          batch_mean_.cpu_data(), Dtype(0),
          spatial_mean_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_ * C_, H_ * W_, 1, Dtype(1),
          spatial_mean_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(1), bottom_diff);

      // bottom_diff = dy - bottom_diff / (N_ * H_ * W_)
      caffe_cpu_axpby(buffer_blob_.count(), Dtype(1),
          buffer_blob_.cpu_data(), Dtype(-1. / (N_ * H_ * W_)),
          bottom_diff);

      // broadcast sqrt(variance + var_eps_) (left in batch_variance_ by the
      // forward pass) to the full blob shape and divide by it
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(),
          batch_variance_.cpu_data(), Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
          N_ * C_, H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());

      caffe_div(buffer_blob_.count(), bottom_diff,
          buffer_blob_.cpu_data(), bottom_diff);
      break;
    case BNParameter_BNMode_INFERENCE:
      // Propagate layer to parameters
      // gradient w.r.t. scale
      caffe_mul(buffer_blob_.count(), bottom_data,
          top_diff, buffer_blob_.mutable_cpu_data());
      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
          H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          spatial_variance_.mutable_cpu_diff());
      // sum across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_variance_.cpu_diff(),
          batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff);

      // gradient w.r.t. shift
      // sum across spatial
      caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
          H_ * W_, Dtype(1), top_diff,
          spatial_sum_multiplier_.cpu_data(),
          Dtype(0), spatial_mean_.mutable_cpu_diff());
      // sum across batch
      caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_,
          Dtype(1), spatial_mean_.cpu_diff(),
          batch_sum_multiplier_.cpu_data(),
          Dtype(0), shift_diff);

      // Propagate down
      // bottom_diff = scale * top_diff (scale broadcast to the blob shape)
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
          spatial_variance_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), Dtype(0),
          buffer_blob_.mutable_cpu_data());
      caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(),
          bottom_diff);
      break;
    default:
      LOG(FATAL) << "Unknown BN mode.";
    }
  }
// CPU-only builds do not compile the .cu file; STUB_GPU presumably supplies
// placeholder GPU methods for BNLayer in that case (macro defined elsewhere -
// confirm).
#ifdef CPU_ONLY
STUB_GPU(BNLayer);
#endif

  // Instantiate the class template and register the layer with the layer
  // factory (presumably under the type name "BN" - confirm against the macro
  // definition).
  INSTANTIATE_CLASS(BNLayer);
  REGISTER_LAYER_CLASS(BN);
}  // namespace caffe

bn_layer.cu实现源码:

#include <algorithm>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/layers/bn_layer.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {
  /**
   * @brief GPU forward pass of the BN layer; same sequence of steps as
   *        Forward_cpu, using the caffe_gpu_* routines.
   *
   * LEARN mode: computes per-channel mean and variance over batch and space,
   * normalizes bottom[0], saves the normalized activations in x_norm_, then
   * applies the per-channel scale (blobs_[0]) and shift (blobs_[1]). When
   * present, top[1] / top[2] receive the batch mean / batch variance (the
   * variance is copied before var_eps_ is added).
   *
   * INFERENCE mode: top = bottom * scale + shift, broadcast per channel.
   *
   * @param bottom bottom[0] is the input blob.
   * @param top top[0] is the output blob; top[1] / top[2] are optional.
   */
  template <typename Dtype>
  void BNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    const Dtype* bottom_data = bottom[0]->gpu_data();
    const Dtype* const_top_data = top[0]->gpu_data();
    Dtype* top_data = top[0]->mutable_gpu_data();
    Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data();
    Dtype* buffer_data = buffer_blob_.mutable_gpu_data();
    // NOTE(review): const_buffer_data is not referenced anywhere below.
    const Dtype* const_buffer_data = buffer_blob_.gpu_data();

    switch (this->layer_param_.bn_param().bn_mode()) {
    case BNParameter_BNMode_LEARN:
      // put the squares of bottom into buffer_blob_
      caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
          buffer_blob_.mutable_gpu_data());

      // computes variance using var(X) = E(X^2) - (EX)^2
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1. / (H_ * W_)),
          bottom_data, spatial_sum_multiplier_.gpu_data(),
          Dtype(0), spatial_mean_data);
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
          spatial_mean_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          batch_mean_.mutable_gpu_data());

      // E(X^2) across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1. / (H_ * W_)), buffer_data,
          spatial_sum_multiplier_.gpu_data(), Dtype(0),
          spatial_variance_.mutable_gpu_data());
      // E(X^2) across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
          spatial_variance_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          batch_variance_.mutable_gpu_data());

      // Only the first batch_mean_.count() entries of buffer_blob_ are used
      // as scratch space here.
      caffe_gpu_powx(batch_mean_.count(), batch_mean_.gpu_data(),
          Dtype(2), buffer_blob_.mutable_gpu_data());  // (EX)^2
      caffe_gpu_sub(batch_mean_.count(), batch_variance_.gpu_data(),
          buffer_data, batch_variance_.mutable_gpu_data());  // variance

      // save top[1] (batch_mean) and top[2] (batch_variance)
      if (top.size() > 1) {
          caffe_copy(batch_mean_.count(), batch_mean_.gpu_data(),
              top[1]->mutable_gpu_data());
      }
      if (top.size() > 2) {
          caffe_copy(batch_variance_.count(), batch_variance_.gpu_data(),
              top[2]->mutable_gpu_data());
      }

      // do mean and variance normalization
      // subtract mean
      // (broadcast batch_mean_ across the batch, then across space, negated)
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), batch_mean_.gpu_data(), Dtype(0),
          spatial_mean_data);
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_,
          1, -Dtype(1),
          spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0),
          buffer_blob_.mutable_gpu_data());

      // top = bottom - mean
      caffe_gpu_add(buffer_blob_.count(), bottom_data, buffer_data, top_data);

      // normalize variance
      // After these two calls batch_variance_ holds sqrt(variance + var_eps_).
      caffe_gpu_add_scalar(batch_variance_.count(), var_eps_,
          batch_variance_.mutable_gpu_data());
      caffe_gpu_powx(batch_variance_.count(), batch_variance_.gpu_data(),
          Dtype(0.5), batch_variance_.mutable_gpu_data());

      // Broadcast sqrt(variance + var_eps_) to the full blob shape.
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0),
          spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0), buffer_blob_.mutable_gpu_data());

      // top = (bottom - mean) / sqrt(variance + var_eps_)
      caffe_gpu_div(buffer_blob_.count(), top_data, buffer_data, top_data);

      // Saving x_norm
      caffe_copy(top[0]->count(), const_top_data, x_norm_.mutable_gpu_data());

      // scale
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
          Dtype(0), spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0), buffer_blob_.mutable_gpu_data());

      caffe_gpu_mul(buffer_blob_.count(), top_data, buffer_data, top_data);

      // shift
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(),
          this->blobs_[1]->gpu_data(), Dtype(0),
          spatial_mean_data);
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1,
          Dtype(1),
          spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0),
          buffer_blob_.mutable_gpu_data());
      caffe_gpu_add(buffer_blob_.count(), top_data, buffer_data, top_data);
      break;
    case BNParameter_BNMode_INFERENCE:
      // scale
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
          Dtype(0), spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0), buffer_blob_.mutable_gpu_data());

      caffe_gpu_mul(buffer_blob_.count(), bottom_data, buffer_data, top_data);

      // shift
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(),
          this->blobs_[1]->gpu_data(), Dtype(0),
          spatial_mean_data);
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1,
          Dtype(1),
          spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0),
          buffer_blob_.mutable_gpu_data());
      caffe_gpu_add(buffer_blob_.count(), top_data, buffer_data, top_data);
      break;
    default:
      LOG(FATAL) << "Unknown BN mode.";
    }
  }

  /**
   * @brief GPU backward pass of the BN layer.
   *
   * Depending on bn_param().bn_mode() this writes the gradient of the scale
   * into blobs_[0]'s diff, the gradient of the shift into blobs_[1]'s diff and
   * the gradient w.r.t. the input into bottom[0]'s diff.
   *
   * Reductions are done with BLAS calls: a gemv against
   * spatial_sum_multiplier_ collapses an (N_*C_) x (H_*W_) matrix to N_*C_
   * values, a transposed gemv against batch_sum_multiplier_ collapses N_ x C_
   * to C_ values, and the matching gemm calls broadcast a C_ vector back to
   * N_ x C_ and then to (N_*C_) x (H_*W_).
   *
   * spatial_mean_, spatial_variance_, batch_mean_ and buffer_blob_ are reused
   * as scratch space, so the order of the statements below matters.
   *
   * @param top            top[0] supplies the incoming diff.
   * @param propagate_down not consulted by this implementation.
   * @param bottom         bottom[0] supplies the input data and receives the
   *                       diff.
   */
  template <typename Dtype>
  void BNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) {
    const Dtype* top_diff = top[0]->gpu_diff();
    // NOTE(review): top_data is not referenced anywhere below.
    const Dtype* top_data = top[0]->gpu_data();
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const Dtype* const_bottom_diff = bottom[0]->gpu_diff();
    Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data();
    // buffer_data / const_buffer_data are used interchangeably with
    // buffer_blob_.mutable_gpu_data() below -- presumably all refer to the
    // same device buffer (confirm against Blob's semantics).
    Dtype* buffer_data = buffer_blob_.mutable_gpu_data();
    const Dtype* const_buffer_data = buffer_blob_.gpu_data();

    switch (this->layer_param_.bn_param().bn_mode()) {
    case BNParameter_BNMode_LEARN:
      // Propagate to layer params
      // gradient w.r.t. scale: sum over batch and space of x_norm * top_diff
      caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
          top_diff, buffer_blob_.mutable_gpu_data());
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
          buffer_data, spatial_sum_multiplier_.gpu_data(), Dtype(0),
      spatial_variance_.mutable_gpu_data());
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_variance_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          this->blobs_[0]->mutable_gpu_diff());

      // gradient w.r.t. shift: sum over batch and space of top_diff
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
          top_diff, spatial_sum_multiplier_.gpu_data(),
          Dtype(0), spatial_mean_data);
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          this->blobs_[1]->mutable_gpu_diff());

      // Propagate down
      // scale top diff: broadcast the per-channel scale (blobs_[0]) to the
      // full blob shape and multiply it into top_diff; the result stays in
      // buffer_blob_.
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
          Dtype(0), spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0),
          buffer_blob_.mutable_gpu_data());
      caffe_gpu_mul(buffer_blob_.count(), top_diff, buffer_data,
          buffer_blob_.mutable_gpu_data());

      // use new top diff for computation
      // bottom_diff = x_norm * (scaled top diff)
      caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
          buffer_data, bottom_diff);
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
          Dtype(1), bottom_diff,
          spatial_sum_multiplier_.gpu_data(), Dtype(0), spatial_mean_data);
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          batch_mean_.mutable_gpu_data());

      // Broadcast the per-channel sum back to the full shape, overwriting
      // bottom_diff (beta = 0).
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(),
          batch_mean_.gpu_data(), Dtype(0),
          spatial_mean_data);
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1), spatial_mean_.gpu_data(),
          spatial_sum_multiplier_.gpu_data(), Dtype(0),
          bottom_diff);

      // bottom_diff = x_norm * (per-channel sum of x_norm * scaled top diff)
      caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
          bottom_diff, bottom_diff);

      // Per-channel sum of the scaled top diff itself.
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
          buffer_data, spatial_sum_multiplier_.gpu_data(),
          Dtype(0), spatial_mean_data);

      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          batch_mean_.mutable_gpu_data());

      // Broadcast that sum and ADD it to bottom_diff (beta = 1 in the second
      // gemm).
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
          C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(),
          batch_mean_.gpu_data(), Dtype(0),
          spatial_mean_data);
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(1),
          bottom_diff);

      // bottom_diff = buffer - bottom_diff / (N_ * H_ * W_), assuming
      // caffe_gpu_axpby computes Y = alpha * X + beta * Y -- confirm.
      caffe_gpu_axpby(buffer_blob_.count(), Dtype(1), buffer_data,
          Dtype(-1. / (N_ * H_ * W_)),
          bottom_diff);

      // put the squares of bottom into buffer_blob_
      // NOTE(review): buffer_blob_ is overwritten by the gemm below (beta = 0)
      // before it is read again, so this result appears to be unused.
      caffe_gpu_powx(buffer_blob_.count(), bottom_data, Dtype(2),
          buffer_blob_.mutable_gpu_data());

      // Broadcast batch_variance_ to the full shape and divide bottom_diff by
      // it element-wise.
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0),
          spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0),
          buffer_blob_.mutable_gpu_data());

      caffe_gpu_div(buffer_blob_.count(), const_bottom_diff,
          const_buffer_data, bottom_diff);
      break;
    case BNParameter_BNMode_INFERENCE:
      // Propagate to layer params
      // gradient w.r.t. scale: sum over batch and space of
      // bottom_data * top_diff (the raw input is used here, not x_norm_)
      caffe_gpu_mul(buffer_blob_.count(), bottom_data,
          top_diff, buffer_blob_.mutable_gpu_data());
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
          buffer_data, spatial_sum_multiplier_.gpu_data(), Dtype(0),
      spatial_variance_.mutable_gpu_data());
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_variance_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          this->blobs_[0]->mutable_gpu_diff());

      // gradient w.r.t. shift: sum over batch and space of top_diff
      // EX across spatial
      caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
          top_diff, spatial_sum_multiplier_.gpu_data(),
          Dtype(0), spatial_mean_data);
      // EX across batch
      caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
          spatial_mean_.gpu_data(),
          batch_sum_multiplier_.gpu_data(), Dtype(0),
          this->blobs_[1]->mutable_gpu_diff());

      // Propagate down
      // scale top diff: bottom_diff = top_diff * (scale broadcast to the full
      // blob shape)
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
          batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
          Dtype(0), spatial_variance_.mutable_gpu_data());
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
          H_ * W_, 1, Dtype(1),
          spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
          Dtype(0),
          buffer_blob_.mutable_gpu_data());
      caffe_gpu_mul(buffer_blob_.count(), top_diff, buffer_data,
          bottom_diff);
      break;
    default:
      LOG(FATAL) << "Unknown BN mode.";
    }
  }

  INSTANTIATE_LAYER_GPU_FUNCS(BNLayer);
}  // namespace caffe

upsample_layer.cpp实现源码:

#include <algorithm>
#include <cfloat>
#include <vector>
#include <iostream>

#include "caffe/layers/upsample_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

/**
 * @brief Reads the UpsampleParameter and decides how the output size is
 *        determined.
 *
 * Two configurations are accepted:
 *  - upsample_h and upsample_w both given: they are taken as the exact output
 *    height/width (each must be > 1);
 *  - otherwise (deprecated path): scale, or scale_h and scale_w, plus the
 *    optional pad_out_h / pad_out_w flags. The output size is then flagged
 *    with -1 so that Reshape computes it from the bottom shape.
 *
 * @param bottom unused here.
 * @param top    unused here.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const UpsampleParameter& upsample_param =
      this->layer_param_.upsample_param();
  CHECK((upsample_param.has_upsample_h() && upsample_param.has_upsample_w())
      || (!upsample_param.has_scale() && upsample_param.has_scale_h()
      && upsample_param.has_scale_w())
      || (!upsample_param.has_scale_h() && !upsample_param.has_scale_w()))
      << "upsample_h & upsample_w are required, else (DEPRECATED) "
      << "scale OR scale_h & scale_w are required.";

  const bool explicit_output_size =
      upsample_param.has_upsample_h() && upsample_param.has_upsample_w();
  if (explicit_output_size) {
    // Exact output shape supplied by the user; nothing else to configure.
    upsample_h_ = upsample_param.upsample_h();
    upsample_w_ = upsample_param.upsample_w();
    CHECK_GT(upsample_h_, 1);
    CHECK_GT(upsample_w_, 1);
    return;
  }

  // Deprecated configuration: derive the output size from scale factors.
  LOG(INFO) << "Params 'pad_out_{}_' are deprecated. Please declare upsample"
      << " height and width useing the upsample_h, upsample_w parameters.";
  if (upsample_param.has_scale_h()) {
    // Separate factors per axis.
    scale_h_ = upsample_param.scale_h();
    scale_w_ = upsample_param.scale_w();
    CHECK_GT(scale_h_, 1);
    CHECK_GT(scale_w_, 1);
  } else {
    // A single factor shared by both axes.
    scale_h_ = scale_w_ = upsample_param.scale();
    CHECK_GT(scale_h_, 1);
  }

  pad_out_h_ = upsample_param.pad_out_h();
  pad_out_w_ = upsample_param.pad_out_w();
  CHECK(!pad_out_h_ || scale_h_ == 2) 
      << "Output height padding compensation requires scale_h == 2, otherwise "
      << "the output size is ill-defined.";
  CHECK(!pad_out_w_ || scale_w_ == 2) 
      << "Output width padding compensation requires scale_w == 2, otherwise "
      << "the output size is ill-defined.";

  // Negative size is the flag that tells Reshape to compute the output shape.
  upsample_h_ = upsample_w_ = -1;
}

/**
 * @brief Validates the two bottoms and shapes the top blob.
 *
 * bottom[0] (input) and bottom[1] (mask) must both be 4-axis blobs with
 * identical num/channels/height/width. If no positive output size is stored
 * yet, it is computed as bottom size * scale minus the pad_out flag. The top
 * is reshaped to (num, channels, upsample_h_, upsample_w_) and the bottom
 * channels/height/width are cached in the corresponding members.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
      << "corresponding to (num, channels, height, width)";
  CHECK_EQ(4, bottom[1]->num_axes()) << "Input mask must have 4 axes, "
      << "corresponding to (num, channels, height, width)";
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
  CHECK_EQ(bottom[0]->height(), bottom[1]->height());
  CHECK_EQ(bottom[0]->width(), bottom[1]->width());

  // Cache the input geometry first; it is needed for the size computation.
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();

  const bool output_size_known = upsample_h_ > 0 && upsample_w_ > 0;
  if (!output_size_known) {
    // Scale-based configuration: one row/column is dropped when the matching
    // pad_out flag is set.
    upsample_h_ = height_ * scale_h_ - static_cast<int>(pad_out_h_);
    upsample_w_ = width_ * scale_w_ - static_cast<int>(pad_out_w_);
  }

  top[0]->Reshape(bottom[0]->num(), channels_, upsample_h_, upsample_w_);
}

/**
 * @brief CPU forward pass: scatters every bottom value into the top plane at
 *        the position stored in the mask.
 *
 * The top blob is zero-filled first, so positions that no mask entry points
 * to remain 0. For each (n, c) plane, element i of bottom[0] is written to
 * top index mask[i], where the mask value is a flat index into the
 * upsample_h_ x upsample_w_ top plane.
 *
 * Aborts with LOG(FATAL) when a mask index lies outside the top plane. Both
 * negative and too-large indices are rejected, because either one would write
 * outside the current plane.
 *
 * @param bottom bottom[0] is the input, bottom[1] the pooling mask.
 * @param top    top[0] receives the upsampled output.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_mask_data = bottom[1]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();

  // Initialize
  const int top_count = top[0]->count();
  caffe_set(top_count, Dtype(0), top_data);

  const int bottom_plane = height_ * width_;
  const int top_plane = upsample_h_ * upsample_w_;
  // The main loop
  for (int n = 0; n < bottom[0]->num(); ++n) {
    for (int c = 0; c < channels_; ++c) {
      for (int i = 0; i < bottom_plane; ++i) {
        const int idx = static_cast<int>(bottom_mask_data[i]);
        if (idx < 0 || idx >= top_plane) {
          // this can happen if the pooling layer that created the input mask
          // had an input with different size to top[0]
          LOG(FATAL) << "upsample top index " << idx << " out of range - "
            << "check scale settings match input pooling layer's "
            << "downsample setup";
        }
        top_data[idx] = bottom_data[i];
      }
      // advance all three pointers to the next (n, c) plane
      bottom_data += bottom[0]->offset(0, 1);
      bottom_mask_data += bottom[1]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
    }
  }
}

/**
 * @brief CPU backward pass: gathers the top gradient back to the bottom using
 *        the mask.
 *
 * Does nothing unless propagate_down[0] is set. For each (n, c) plane,
 * bottom_diff[i] = top_diff[mask[i]], where the mask value is a flat index
 * into the upsample_h_ x upsample_w_ top plane.
 *
 * The range check uses the actual top plane size (upsample_h_ * upsample_w_),
 * the same bound Forward_cpu uses. The previous bound,
 * height_ * width_ * scale_h_ * scale_w_, read scale_h_/scale_w_ even when
 * LayerSetUp had not assigned them (upsample_h/upsample_w configuration) and
 * was larger than the top plane when pad_out_h/pad_out_w shrink the output,
 * so invalid indices could slip through. Negative indices are rejected too.
 *
 * @param top            top[0] supplies the incoming diff.
 * @param propagate_down only element 0 is consulted.
 * @param bottom         bottom[0] receives the diff, bottom[1] is the mask.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->cpu_diff();
    const Dtype* bottom_mask_data = bottom[1]->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();

    const int bottom_count = bottom[0]->count();
    caffe_set(bottom_count, Dtype(0), bottom_diff);

    const int bottom_plane = height_ * width_;
    const int top_plane = upsample_h_ * upsample_w_;
    // The main loop
    for (int n = 0; n < bottom[0]->num(); ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int i = 0; i < bottom_plane; ++i) {
          const int idx = static_cast<int>(bottom_mask_data[i]);
          if (idx < 0 || idx >= top_plane) {
            // this can happen if the pooling layer that created
            // the input mask had an input with different size to top[0]
            LOG(FATAL) << "upsample top index " << idx << " out of range - "
              << "check scale settings match input pooling layer's downsample setup";
          }
          bottom_diff[i] = top_diff[idx];
        }
        // advance all three pointers to the next (n, c) plane
        bottom_diff += bottom[0]->offset(0, 1);
        bottom_mask_data += bottom[1]->offset(0, 1);
        top_diff += top[0]->offset(0, 1);
      }
    }
  }
}


// In a CPU_ONLY build no .cu file is compiled, so STUB_GPU is applied to
// UpsampleLayer instead (presumably it supplies placeholder Forward_gpu /
// Backward_gpu definitions -- confirm against the macro's definition).
#ifdef CPU_ONLY
STUB_GPU(UpsampleLayer);
#endif

// NOTE(review): judging by the macro names, these instantiate the class
// template and register the layer under the type name "Upsample"; confirm
// against Caffe's macro definitions.
INSTANTIATE_CLASS(UpsampleLayer);
REGISTER_LAYER_CLASS(Upsample);

}  // namespace caffe

upsample_layer.cu实现源码:

#include <algorithm>
#include <cfloat>
#include <vector>

#include "caffe/layers/upsample_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

/**
 * @brief Kernel for the forward pass: copies bottom_data[index] to the top
 *        position selected by bottom_mask[index].
 *
 * index / (in_w * in_h) identifies the plane an element belongs to; that plane
 * starts at plane * out_w * out_h in the top buffer, and the mask value is the
 * offset inside it. No range check is performed on the mask value.
 *
 * @param nthreads    number of bottom elements to process.
 * @param in_w,in_h   width/height of one bottom plane.
 * @param out_w,out_h width/height of one top plane.
 */
template <typename Dtype>
__global__ void UpsampleForward(const int nthreads, int in_w, int in_h,
    int out_w, int out_h, const Dtype* bottom_data,
    const Dtype* bottom_mask, Dtype* top_data) {
  const int in_plane = in_w * in_h;
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int plane = index / in_plane;
    const int plane_start = plane * out_w * out_h;
    const int within_plane = static_cast<int>(bottom_mask[index]);
    top_data[plane_start + within_plane] = bottom_data[index];
  }
}

/**
 * @brief GPU forward pass: zero-fills the top blob, then launches
 *        UpsampleForward with one loop iteration per bottom element.
 *
 * @param bottom bottom[0] is the input, bottom[1] the pooling mask.
 * @param top    top[0] receives the upsampled output.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* input = bottom[0]->gpu_data();
  const Dtype* mask = bottom[1]->gpu_data();
  Dtype* output = top[0]->mutable_gpu_data();

  // Positions that no mask entry points to must read as zero.
  caffe_gpu_set(top[0]->count(), Dtype(0), output);

  const int num_inputs = bottom[0]->count();
  UpsampleForward<Dtype>
      <<<CAFFE_GET_BLOCKS(num_inputs), CAFFE_CUDA_NUM_THREADS>>>(
          num_inputs,
          bottom[0]->width(), bottom[0]->height(),
          top[0]->width(), top[0]->height(),
          input, mask, output);
  CUDA_POST_KERNEL_CHECK;
}

/**
 * @brief Kernel for the backward pass: reads the top gradient at the position
 *        selected by bottom_mask[index] and stores it in bottom_diff[index].
 *
 * Uses the same addressing as the forward kernel: plane index times
 * out_w * out_h, plus the mask value. No range check is performed on the
 * mask value.
 *
 * @param nthreads    number of bottom elements to process.
 * @param in_w,in_h   width/height of one bottom plane.
 * @param out_w,out_h width/height of one top plane.
 */
template <typename Dtype>
__global__ void UpsampleBackward(const int nthreads, int in_w, int in_h,
    int out_w, int out_h, const Dtype* top_diff,
    const Dtype* bottom_mask, Dtype* bottom_diff) {
  const int in_plane = in_w * in_h;
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int plane = index / in_plane;
    const int plane_start = plane * out_w * out_h;
    const int within_plane = static_cast<int>(bottom_mask[index]);
    bottom_diff[index] = top_diff[plane_start + within_plane];
  }
}

/**
 * @brief GPU backward pass: when propagate_down[0] is set, zero-fills the
 *        bottom diff and launches UpsampleBackward with one loop iteration
 *        per bottom element.
 *
 * @param top            top[0] supplies the incoming diff.
 * @param propagate_down only element 0 is consulted.
 * @param bottom         bottom[0] receives the diff, bottom[1] is the mask.
 */
template <typename Dtype>
void UpsampleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }

  const Dtype* grad_out = top[0]->gpu_diff();
  const Dtype* mask = bottom[1]->gpu_data();
  Dtype* grad_in = bottom[0]->mutable_gpu_diff();

  const int num_inputs = bottom[0]->count();
  caffe_gpu_set(num_inputs, Dtype(0.), grad_in);
  UpsampleBackward<Dtype>
      <<<CAFFE_GET_BLOCKS(num_inputs), CAFFE_CUDA_NUM_THREADS>>>(
          num_inputs,
          bottom[0]->width(), bottom[0]->height(),
          top[0]->width(), top[0]->height(),
          grad_out, mask, grad_in);
  CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS(UpsampleLayer);
}  // namespace caffe

bn_layer.hpp头文件和upsample_layer.hpp头文件添加

bn_layer.hpp实现源码:

#ifndef CAFFE_BN_LAYER_HPP_
#define CAFFE_BN_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Batch-normalization layer with its own learned per-channel scale and
 *        shift (layer type "BN").
 *
 * Unlike Caffe's built-in BatchNormLayer, this layer keeps the scale and the
 * shift as its own parameter blobs (blobs_[0] = scale, blobs_[1] = shift), so
 * no separate ScaleLayer is needed after it.
 *
 * Behaviour is selected by bn_param().bn_mode():
 *  - BNParameter_BNMode_LEARN: up to three top blobs are allowed and the
 *    backward pass also propagates through the normalization (it uses
 *    x_norm_ and batch_variance_);
 *  - BNParameter_BNMode_INFERENCE: exactly one top blob; the backward pass
 *    only applies the per-channel scale.
 *
 * Statistics are reduced per channel over both the spatial dimensions and the
 * batch; the reductions and broadcasts are carried out as BLAS products with
 * the two *_sum_multiplier_ blobs.
 *
 * Reference: S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep
 * Network Training by Reducing Internal Covariate Shift." arXiv preprint
 * arXiv:1502.03167 (2015).
 */
template <typename Dtype>
class BNLayer : public Layer<Dtype> {
 public:
  explicit BNLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "BN"; }
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  virtual inline int MinTopBlobs() const { return 1; }
  // In LEARN mode at most 3 top blobs are allowed; in any other mode
  // exactly 1.
  virtual inline int MaxTopBlobs() const {
    return (this->layer_param_.bn_param().bn_mode() ==
            BNParameter_BNMode_LEARN) ? 3 : 1;
  }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Per-(sample, channel) statistics; also reused as scratch space for
  // intermediate N_ x C_ results.
  Blob<Dtype> spatial_mean_, spatial_variance_;
  // Per-channel statistics over the whole batch.
  Blob<Dtype> batch_mean_, batch_variance_;
  // Scratch blob with the full (N_, C_, H_, W_) element count.
  Blob<Dtype> buffer_blob_;

  // Normalized input, kept for the backward pass.
  Blob<Dtype> x_norm_;
  // Multiplier vectors used to carry out sums (and broadcasts) with BLAS:
  // one over the spatial extent, one over the batch.
  Blob<Dtype> spatial_sum_multiplier_, batch_sum_multiplier_;

  // Blob dimensions: batch size, channels, height, width.
  int N_;
  int C_;
  int H_;
  int W_;
  // eps -- presumably the constant added to the variance for numerical
  // stability (confirm in the .cpp implementation).
  Dtype var_eps_;
};

}  // namespace caffe

#endif  // CAFFE_BN_LAYER_HPP_

upsample_layer.hpp实现源码:

#ifndef CAFFE_UPSAMPLE_LAYER_HPP_
#define CAFFE_UPSAMPLE_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Upsampling layer driven by a pooling mask (layer type "Upsample").
 *
 * Takes exactly two bottoms -- the input and a mask of the same shape whose
 * values are flat indices into one plane of the output -- and produces exactly
 * one top. Each input value is placed at the output position named by its
 * mask entry; all other output positions are zero.
 *
 * The output size is either given directly (upsample_h / upsample_w) or
 * derived from the deprecated scale / pad_out parameters.
 */
template <typename Dtype>
class UpsampleLayer : public Layer<Dtype> {
 public:
  explicit UpsampleLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  // Reads UpsampleParameter and validates the configuration.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Checks the bottom shapes and sizes the top blob.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Upsample"; }
  // [input, encoder max-pooling mask]
  virtual inline int ExactNumBottomBlobs() const { return 2; }
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Geometry of bottom[0], cached by Reshape.
  int channels_;
  int height_;
  int width_;
  // Deprecated scale factors; only assigned by LayerSetUp when upsample_h /
  // upsample_w are not both given.
  int scale_h_, scale_w_;
  // Deprecated flags; when set, Reshape subtracts one row / column from the
  // scaled output size. Only assigned on the scale-based path.
  bool pad_out_h_, pad_out_w_;
  // Height / width of one top plane. LayerSetUp stores -1 as a flag meaning
  // "compute in Reshape".
  int upsample_h_, upsample_w_;
};

}  // namespace caffe

#endif  // CAFFE_UPSAMPLE_LAYER_HPP_
最后在src/caffe/proto/caffe.proto文件的LayerParameter消息中添加bn_param和upsample_param这两个字段就可以:
// Configuration of a single layer. The type-specific *_param fields below are
// listed alphabetically; bn_param (150) and upsample_param (151) are the two
// fields added for the custom BN and Upsample layers.
message LayerParameter {
  optional string name = 1; // the layer name
  optional string type = 2; // the layer type
  repeated string bottom = 3; // the name of each bottom blob
  repeated string top = 4; // the name of each top blob

  // The train / test phase for computation.
  optional Phase phase = 10;

  // The amount of weight to assign each top blob in the objective.
  // Each layer assigns a default value, usually of either 0 or 1,
  // to each top blob.
  repeated float loss_weight = 5;

  // Specifies training parameters (multipliers on global learning constants,
  // and the name and other settings used for weight sharing).
  repeated ParamSpec param = 6;

  // The blobs containing the numeric parameters of the layer.
  repeated BlobProto blobs = 7;

  // Specifies whether to backpropagate to each bottom. If unspecified,
  // Caffe will automatically infer whether each input needs backpropagation
  // to compute parameter gradients. If set to true for some inputs,
  // backpropagation to those inputs is forced; if set false for some inputs,
  // backpropagation to those inputs is skipped.
  //
  // The size must be either 0 or equal to the number of bottoms.
  repeated bool propagate_down = 11;

  // Rules controlling whether and when a layer is included in the network,
  // based on the current NetState.  You may specify a non-zero number of rules
  // to include OR exclude, but not both.  If no include or exclude rules are
  // specified, the layer is always included.  If the current NetState meets
  // ANY (i.e., one or more) of the specified rules, the layer is
  // included/excluded.
  repeated NetStateRule include = 8;
  repeated NetStateRule exclude = 9;

  // Parameters for data pre-processing.
  optional TransformationParameter transform_param = 100;

  // Parameters shared by loss layers.
  optional LossParameter loss_param = 101;

  // Layer type-specific parameters.
  //
  // Note: certain layers may have more than one computational engine
  // for their implementation. These layers include an Engine type and
  // engine parameter for selecting the implementation.
  // The default for the engine is set by the ENGINE switch at compile-time.
  optional AccuracyParameter accuracy_param = 102;
  optional ArgMaxParameter argmax_param = 103;
  // ADDED: parameters of the custom BN layer.
  optional BNParameter bn_param = 150;
  optional BatchNormParameter batch_norm_param = 139;
  optional BiasParameter bias_param = 141;
  optional ConcatParameter concat_param = 104;
  optional ContrastiveLossParameter contrastive_loss_param = 105;
  optional ConvolutionParameter convolution_param = 106;
  optional CropParameter crop_param = 144;
  optional DataParameter data_param = 107;
  optional DropoutParameter dropout_param = 108;
  optional DummyDataParameter dummy_data_param = 109;
  optional EltwiseParameter eltwise_param = 110;
  optional ELUParameter elu_param = 140;
  optional EmbedParameter embed_param = 137;
  optional ExpParameter exp_param = 111;
  optional FlattenParameter flatten_param = 135;
  optional HDF5DataParameter hdf5_data_param = 112;
  optional HDF5OutputParameter hdf5_output_param = 113;
  optional HingeLossParameter hinge_loss_param = 114;
  optional ImageDataParameter image_data_param = 115;
  optional InfogainLossParameter infogain_loss_param = 116;
  optional InnerProductParameter inner_product_param = 117;
  optional InputParameter input_param = 143;
  optional LogParameter log_param = 134;
  optional LRNParameter lrn_param = 118;
  optional MemoryDataParameter memory_data_param = 119;
  optional MVNParameter mvn_param = 120;
  optional ParameterParameter parameter_param = 145;
  optional PoolingParameter pooling_param = 121;
  optional PowerParameter power_param = 122;
  optional PReLUParameter prelu_param = 131;
  optional PythonParameter python_param = 130;
  optional RecurrentParameter recurrent_param = 146;
  optional ReductionParameter reduction_param = 136;
  optional ReLUParameter relu_param = 123;
  optional ReshapeParameter reshape_param = 133;
  optional ScaleParameter scale_param = 142;
  optional SigmoidParameter sigmoid_param = 124;
  optional SoftmaxParameter softmax_param = 125;
  optional SPPParameter spp_param = 132;
  optional SliceParameter slice_param = 126;
  optional SwishParameter swish_param = 147;
  optional TanHParameter tanh_param = 127;
  optional ThresholdParameter threshold_param = 128;
  optional TileParameter tile_param = 138;
  // ADDED: parameters of the custom Upsample layer.
  optional UpsampleParameter upsample_param = 151;
  optional WindowDataParameter window_data_param = 129;
}

对应的这两个parameter里面的参数:

// Parameters of the custom BN layer (LayerParameter.bn_param).
message BNParameter {
  // Operating mode of the layer. The layer allows up to 3 top blobs in LEARN
  // mode and exactly 1 otherwise, and runs a different forward/backward
  // branch per mode.
  enum BNMode {
    LEARN = 0;
    INFERENCE = 1;
  }
  // Defaults to LEARN when not specified.
  optional BNMode bn_mode = 3 [default = LEARN];
  optional FillerParameter scale_filler = 1;  // The filler for the scale
  optional FillerParameter shift_filler = 2;  // The filler for the shift
}
// Parameters of the custom Upsample layer (LayerParameter.upsample_param).
// Preferred usage: set upsample_h and upsample_w. The remaining fields are the
// deprecated scale-based configuration.
message UpsampleParameter {
  // DEPRECATED. No need to specify upsampling scale factors when
  // exact output shape is given by upsample_h, upsample_w parameters.
  // Single factor applied to both height and width; must be > 1.
  optional uint32 scale = 1 [default = 2];
  // DEPRECATED. No need to specify upsampling scale factors when
  // exact output shape is given by upsample_h, upsample_w parameters.
  // Height factor; must be > 1 and be given together with scale_w.
  optional uint32 scale_h = 2;
  // DEPRECATED. No need to specify upsampling scale factors when
  // exact output shape is given by upsample_h, upsample_w parameters.
  // Width factor; must be > 1 and be given together with scale_h.
  optional uint32 scale_w = 3;
  // DEPRECATED. Specify exact output height using upsample_h. This
  // parameter only works when scale is 2
  // When true, the computed output height is reduced by one row.
  optional bool pad_out_h = 4 [default = false];
  // DEPRECATED. Specify exact output width using upsample_w. This
  // parameter only works when scale is 2
  // When true, the computed output width is reduced by one column.
  optional bool pad_out_w = 5 [default = false];
  // Exact output height; must be > 1. Takes effect only when upsample_w is
  // also set.
  optional uint32 upsample_h = 6;
  // Exact output width; must be > 1. Takes effect only when upsample_h is
  // also set.
  optional uint32 upsample_w = 7;
}
最后所有添加工作完成之后直接编译就可以使用了。

中途问题解决地址:

https://blog.youkuaiyun.com/kevin_darkelf/article/details/51683456

https://github.com/BVLC/caffe/issues/5451

https://github.com/BVLC/caffe

http://www.cnblogs.com/yqyouqing/p/7231405.html

您可能感兴趣的与本文相关的镜像

Llama Factory

Llama Factory

模型微调
LLama-Factory

LLaMA Factory 是一个简单易用且高效的大型语言模型(Large Language Model)训练与微调平台。通过 LLaMA Factory,可以在无需编写任何代码的前提下,在本地完成上百种预训练模型的微调

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值