1. Introduction
When analyzing network performance, you often need to find the most time-consuming parts of the network, which means printing how much time each layer spends during forward propagation. Fortunately, caffe.cpp already contains code for exactly this; the code presented here is a modified version of it.
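For reference, the stock Caffe binary exposes the same benchmark through its time subcommand. If you do not need to fill the input blobs yourself, a command along the following lines (with your own model path) is usually enough:

caffe time --model=./models/test.prototxt --iterations=10 --gpu=0

The program below goes one step further and writes real data into the input blobs before timing, which matches how the network would be exercised in an actual deployment.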
2. Code Implementation
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include <caffe/blob.hpp>
#include <caffe/common.hpp>
#include <caffe/layer.hpp>
#include <caffe/net.hpp>
#include <caffe/util/benchmark.hpp>
#include <caffe/proto/caffe.pb.h>
using std::vector;
using caffe::shared_ptr;
using caffe::Blob;
using caffe::Net;
using caffe::Layer;
using caffe::Timer;
/*
* Prints the time spent in each layer during forward propagation.
* deploy_proto: network definition file (deploy prototxt)
* model_file: trained weights (.caffemodel)
* FLAGS_iterations: number of timing iterations (default 10)
*/
void TimeConsumptionCalc(const std::string& deploy_proto, const std::string& model_file, const int FLAGS_iterations = 10)
{
// Instantiate the caffe net.
Net<float> caffe_net(deploy_proto, caffe::TEST);
caffe_net.CopyTrainedLayersFrom(model_file);
// Prepare the net input; adapt this part to your own use case.
// Image data blob
const vector<Blob<float> *> &input_blobs = caffe_net.input_blobs();
cv::Mat img_in(832, 608, CV_32FC3, cv::Scalar(123, 23, 223)); // dummy 832x608 (rows x cols) float image
input_blobs[0]->Reshape(1, img_in.channels(), img_in.rows, img_in.cols);
float *blob_data = input_blobs[0]->mutable_cpu_data();
const int cols = img_in.cols;
const int rows = img_in.rows;
// Copy from OpenCV's interleaved HWC layout into the blob's planar CHW layout.
for (int i = 0; i < rows; ++i)
{
const cv::Vec3f* data = img_in.ptr<cv::Vec3f>(i); // row pointer into the Mat; valid because img_in is CV_32FC3
for (int j = 0; j < cols; ++j)
{
blob_data[(0 * rows + i)*cols + j] = data[j][0];
blob_data[(1 * rows + i)*cols + j] = data[j][1];
blob_data[(2 * rows + i)*cols + j] = data[j][2];
}
}
//image info
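// The second input blob presumably holds (height, width, scale), as in Faster R-CNN style networks.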
input_blobs[1]->Reshape(1, 3, 1, 1);
blob_data = input_blobs[1]->mutable_cpu_data();
float img_info[] = {832, 608, 1.0};
std::memcpy(blob_data, &img_info[0], sizeof(float)*3);
// Do one warm-up forward pass first, so that memory allocations are done
// and the timed iterations below are more stable.
LOG(INFO) << "Performing Forward";
float initial_loss;
caffe_net.Forward(&initial_loss);
LOG(INFO) << "Initial loss: " << initial_loss;
const vector<shared_ptr<Layer<float> > >& layers = caffe_net.layers();
const vector<vector<Blob<float>*> >& bottom_vecs = caffe_net.bottom_vecs();
const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
LOG(INFO) << "*** Benchmark begins ***";
LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations.";
Timer total_timer;
total_timer.Start();
Timer forward_timer;
Timer timer;
std::vector<double> forward_time_per_layer(layers.size(), 0.0);
double forward_time = 0.0;
for (int j = 0; j < FLAGS_iterations; ++j) {
Timer iter_timer;
iter_timer.Start();
forward_timer.Start();
for (size_t i = 0; i < layers.size(); ++i) {
timer.Start();
layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
forward_time_per_layer[i] += timer.MicroSeconds();
}
forward_time += forward_timer.MicroSeconds();
LOG(INFO) << "Iteration: " << j + 1 << " forward-backward time: "
<< iter_timer.MilliSeconds() << " ms.";
}
LOG(INFO) << "Average time per layer: ";
const std::vector<std::string>& layernames = caffe_net.layer_names();
LOG(INFO) << "layer_names size: " << layernames.size() << "; layers size: " << layers.size();
for (size_t i = 0; i < layers.size(); ++i) {
const std::string& layername = layernames[i];
LOG(INFO) << std::setfill(' ') << std::setw(10) << layername <<
"\tforward: " << forward_time_per_layer[i] / 1000 /
FLAGS_iterations << " ms.";
}
total_timer.Stop();
LOG(INFO) << "Average Forward pass: " << forward_time / 1000 /
FLAGS_iterations << " ms.";
LOG(INFO) << "Total Time: " << total_timer.MilliSeconds() << " ms.";
LOG(INFO) << "*** Benchmark ends ***";
}
int main(int argc, char* argv[])
{
if (argc != 3)
{
std::cout << "Usage: " << argv[0] << " time gpu_id  (a negative gpu_id selects CPU)" << std::endl;
return 0;
}
std::string run_type = std::string(argv[1]);
if (run_type == "time")
{
int device_id = atoi(argv[2]); // GPU id; a negative value selects CPU mode
if (device_id >= 0)
{
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(device_id);
}
else
{
caffe::Caffe::set_mode(caffe::Caffe::CPU);
}
TimeConsumptionCalc("./models/test.prototxt",
"./models/caffe.caffemodel",
10);
}
return 0;
}
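To build the program, link against Caffe, OpenCV, glog and protobuf; the exact include and library paths depend on your installation, so the compile line below is only a sketch (the binary name layer_time is arbitrary):

g++ -std=c++11 main.cpp -o layer_time -I/path/to/caffe/include -L/path/to/caffe/build/lib -lcaffe -lglog -lprotobuf `pkg-config --cflags --libs opencv`

It can then be run as, for example, ./layer_time time 0 to benchmark on GPU 0, or ./layer_time time -1 to fall back to CPU.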
3. Results
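Each line below reports a layer's forward time averaged over the 10 timing iterations, in milliseconds: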
I0408 16:34:39.648761 66204 main.cpp:159] input forward: 0.007 ms.
I0408 16:34:39.648782 66204 main.cpp:159] conv1_1 forward: 727.992 ms.
I0408 16:34:39.648793 66204 main.cpp:159] relu1_1 forward: 44.2521 ms.
I0408 16:34:39.648805 66204 main.cpp:159] conv1_2 forward: 3841.53 ms.
I0408 16:34:39.648814 66204 main.cpp:159] relu1_2 forward: 51.561 ms.
I0408 16:34:39.648823 66204 main.cpp:159] pool1 forward: 141.595 ms.
I0408 16:34:39.648829 66204 main.cpp:159] conv2_1 forward: 1535.58 ms.
I0408 16:34:39.648836 66204 main.cpp:159] relu2_1 forward: 23.1318 ms.
I0408 16:34:39.648844 66204 main.cpp:159] conv2_2 forward: 3124.48 ms.
I0408 16:34:39.648850 66204 main.cpp:159] relu2_2 forward: 26.3654 ms.
I0408 16:34:39.648857 66204 main.cpp:159] pool2 forward: 80.442 ms.
I0408 16:34:39.648864 66204 main.cpp:159] conv3_1 forward: 1411.55 ms.
I0408 16:34:39.648872 66204 main.cpp:159] relu3_1 forward: 12.0473 ms.
I0408 16:34:39.648880 66204 main.cpp:159] conv3_2 forward: 2761.23 ms.
I0408 16:34:39.648887 66204 main.cpp:159] relu3_2 forward: 11.3475 ms.
I0408 16:34:39.648895 66204 main.cpp:159] conv3_3 forward: 2789.08 ms.
I0408 16:34:39.648900 66204 main.cpp:159] relu3_3 forward: 11.7217 ms.
I0408 16:34:39.648908 66204 main.cpp:159] pool3 forward: 34.3051 ms.
I0408 16:34:39.648916 66204 main.cpp:159] conv4_1 forward: 1337.64 ms.
I0408 16:34:39.648922 66204 main.cpp:159] relu4_1 forward: 6.4562 ms.
I0408 16:34:39.648928 66204 main.cpp:159] conv4_2 forward: 2633.94 ms.
I0408 16:34:39.648936 66204 main.cpp:159] relu4_2 forward: 6.0823 ms.
I0408 16:34:39.648943 66204 main.cpp:159] conv4_3 forward: 2576.46 ms.
I0408 16:34:39.648950 66204 main.cpp:159] relu4_3 forward: 5.2473 ms.
I0408 16:34:39.648957 66204 main.cpp:159] pool4 forward: 16.83 ms.
I0408 16:34:39.648964 66204 main.cpp:159] conv5_1 forward: 679.819 ms.
I0408 16:34:39.648972 66204 main.cpp:159] relu5_1 forward: 1.7372 ms.
I0408 16:34:39.648978 66204 main.cpp:159] conv5_2 forward: 691.741 ms.
I0408 16:34:39.648991 66204 main.cpp:159] relu5_2 forward: 1.3683 ms.
I0408 16:34:39.648998 66204 main.cpp:159] conv5_3 forward: 661.567 ms.
I0408 16:34:39.649006 66204 main.cpp:159] relu5_3 forward: 1.1911 ms.
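As the breakdown shows, the convolution layers account for the overwhelming majority of the forward time (conv1_2 alone averages about 3.8 s here), while the ReLU and pooling layers are comparatively cheap, so any optimization effort is best directed at the convolutions.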