安卓MNN半精度记录
- 默认编译生成的so库,没有开启arm82。若要使用fp16计算,需要修改cmakelists.txt 151行
option(MNN_ARM82 "Enable ARM82" OFF)
改为ON
- MNNConvert --fp16,转出来的模型只是缩减了模型的存储大小,与推理时的计算精度无关。
- 我用较大的模型fp32跑延时大概100ms,fp16跑大概50ms,速度可以提升一半;小模型的话可能只提升几ms;手机需要支持arm82才能开启fp16计算。
- 代码中的fp16配置,只需要设置
m_backend_config.precision = MNN::BackendConfig::PrecisionMode::Precision_Low;
即可。这里有三种精度模式Precision_High
,Precision_Normal
,Precision_Low
;默认用 low,速度最快;设备支持 fp16 计算并且 so 库开启了 arm82 的话,实时性最好;normal 对应 int8 计算;high 对应 fp32 计算;若使用 low 模式而设备不支持 fp16 计算,会自动切换成 fp32。
- 不需要用 torch 混合精度训练后再转换模型,只需正常训练并转换模型即可。
- 若使用high结果正常,使用low结果完全就是错的。你可能直接这么获取结果
tensor->host
这对于fp16来说是错误的,正确获取结果的方式为:
MNN::Tensor *output= m_net->getSessionOutput(m_session, NULL);
MNN::Tensor tensor_scores_host(output, output->getDimensionType());// 得到结果指针
output->copyToHostTensor(&tensor_scores_host);
std::vector<LaneDetect::Lanes> lanes;
decodeHeatmap(lanes,tensor_scores_host, width, height, threshold,lens_threshold);
函数:
void LaneDetect::decodeHeatmap(std::vector<LaneDetect::Lanes>& lanes,const MNN::Tensor& heatmap,int w, int h, double threshold, double lens_threshold)
{
const float* displacement = heatmap.host<float>()+m_hm_size*m_hm_size;
const float* hm = heatmap.host<float>();
...
}
以下是我工程的完整代码:
#include "lane.hpp"
// Whether the device has a usable GPU — presumably set by the caller/JNI layer
// before construction; TODO confirm (it is never assigned in this file).
bool LaneDetect::hasGPU = false;
// Effective GPU flag computed in the constructor: hasGPU && requested useGPU.
bool LaneDetect::toUseGPU = false;
// Singleton-style instance pointer, presumably managed by the JNI wrapper.
LaneDetect *LaneDetect::detector = nullptr;
/// Construct the detector: load the MNN model, configure the backend
/// (fp16 via Precision_Low when the device/so supports ARM82), set up the
/// image pre-processor, and create/resize the inference session.
/// @param mnn_path path to the .mnn model file
/// @param useGPU   request the OpenCL backend (honored only if hasGPU is set)
LaneDetect::LaneDetect(const std::string &mnn_path, bool useGPU)
{
    // Effective GPU flag: request AND availability.
    toUseGPU = hasGPU && useGPU;
    m_net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(mnn_path.c_str()));
    // Precision_Low enables fp16 compute on ARM82-capable devices; MNN falls
    // back to fp32 automatically when fp16 is unsupported.
    m_backend_config.precision = MNN::BackendConfig::PrecisionMode::Precision_Low;
    m_backend_config.power = MNN::BackendConfig::Power_Normal;    // power profile
    m_backend_config.memory = MNN::BackendConfig::Memory_Normal;  // memory profile
    m_config.backendConfig = &m_backend_config;
    m_config.numThread = 2;
    // Fix: test the effective flag, not the raw request — previously a GPU
    // request on a GPU-less device selected OpenCL and relied on the backup
    // type to fall back; now we pick the CPU path directly in that case.
    if (toUseGPU) {
        m_config.type = MNN_FORWARD_OPENCL;
    }
    m_config.backupType = MNN_FORWARD_CPU;

    // Image pre-processing: (img - mean) * norm, BGR -> RGB.
    MNN::CV::ImageProcess::Config img_config;
    ::memcpy(img_config.mean, m_mean_vals, sizeof(m_mean_vals));
    ::memcpy(img_config.normal, m_norm_vals, sizeof(m_norm_vals));
    img_config.sourceFormat = MNN::CV::BGR;
    img_config.destFormat = MNN::CV::RGB;
    pretreat = std::shared_ptr<MNN::CV::ImageProcess>(MNN::CV::ImageProcess::create(img_config));
    MNN::CV::Matrix trans;
    trans.setScale(1.0f, 1.0f); // identity scale: input is resized beforehand
    pretreat->setMatrix(trans);

    m_session = m_net->createSession(m_config); // create the inference session
    m_inTensor = m_net->getSessionInput(m_session, NULL);
    m_net->resizeTensor(m_inTensor, {1, 3, m_input_size, m_input_size});
    m_net->resizeSession(m_session);
    std::cout << "session created" << std::endl;

    // Pre-build the index table [0, hm*hm) once; decodeHeatmap copies and
    // sorts it per frame instead of re-filling it.
    m_index.resize(m_hm_size * m_hm_size);
    for (int i = 0; i != m_hm_size * m_hm_size; i++) {
        m_index[i] = i;
    }
}
// Destructor: release the model buffer and tear down the inference session.
// NOTE(review): releaseModel before releaseSession follows the original code;
// MNN permits freeing the model buffer once sessions are created — confirm
// against the MNN Interpreter docs if the order is ever changed.
LaneDetect::~LaneDetect()
{
m_net->releaseModel();
m_net->releaseSession(m_session);
}
/// Clamp a (scaled) coordinate into the valid pixel range.
/// Values in (0, m_input_size) are truncated to int; out-of-range values are
/// pinned to 1 on the low side (matching the original's choice of 1, not 0)
/// and m_input_size - 1 on the high side.
/// Bug fix: value == 0 previously matched neither `> 0` nor `< 0` and fell
/// through to the high-side clamp, returning m_input_size - 1.
inline int LaneDetect::clip(float value)
{
    if (value <= 0)
        return 1;
    if (value >= m_input_size)
        return m_input_size - 1;
    return int(value);
}
// old
/// Decode the network output into line segments.
/// Host tensor layout: a centre score map of m_hm_size*m_hm_size floats,
/// followed by 4 displacement planes of the same size (x1, y1, x2, y2 offsets).
/// @param lanes          output: decoded segments, scaled to (w, h)
/// @param heatmap        host-side copy of the output tensor (fp16-safe)
/// @param w, h           original image width/height for rescaling
/// @param threshold      minimum centre confidence to keep a segment
/// @param lens_threshold minimum segment length (in heat-map units) to keep
void LaneDetect::decodeHeatmap(std::vector<LaneDetect::Lanes>& lanes, const MNN::Tensor& heatmap, int w, int h, double threshold, double lens_threshold)
{
    const int plane = m_hm_size * m_hm_size;
    // Centre scores (plane floats), then 4 displacement planes.
    const float* hm = heatmap.host<float>();
    const float* displacement = hm + plane;

    // Copy the scores so sorting indices can reference a stable buffer.
    // (mlsd.mnn originally required sigmoid: 1/(exp(-hm[i]) + 1); this model
    // emits scores directly.)
    std::vector<float> center(plane);
    // Fix: the original line was corrupted to "¢er[0]" (HTML-entity
    // mangling of "&center[0]") and did not compile.
    memcpy(center.data(), hm, plane * sizeof(float));

    // Sort a copy of the precomputed index table by descending score.
    std::vector<int> index = m_index;
    sort(index.begin(), index.end(),
         [&](const int& a, const int& b) {
             return center[a] > center[b];
         });

    for (int i = 0; i < plane; i++)
    {
        int yy = index[i] / m_hm_size; // row
        int xx = index[i] % m_hm_size; // column
        Lanes Lane;
        // Endpoints = grid cell + per-cell displacement from the 4 planes.
        Lane.x1 = xx + displacement[index[i] + 0 * plane];
        Lane.y1 = yy + displacement[index[i] + 1 * plane];
        Lane.x2 = xx + displacement[index[i] + 2 * plane];
        Lane.y2 = yy + displacement[index[i] + 3 * plane];
        Lane.lens = sqrt(pow(Lane.x1 - Lane.x2, 2) + pow(Lane.y1 - Lane.y2, 2));
        Lane.conf = center[index[i]];
        if (center[index[i]] > threshold && lanes.size() < m_top_k)
        {
            if (Lane.lens > lens_threshold)
            {
                // Heat map is at half the network input resolution; rescale
                // to the original image size and clamp into bounds.
                Lane.x1 = clip(w * Lane.x1 / (m_input_size / 2));
                Lane.x2 = clip(w * Lane.x2 / (m_input_size / 2));
                Lane.y1 = clip(h * Lane.y1 / (m_input_size / 2));
                Lane.y2 = clip(h * Lane.y2 / (m_input_size / 2));
                lanes.push_back(Lane);
            }
        }
        else
            // Scores are sorted descending, so nothing further can pass.
            break;
    }
}
/// Run one inference on an image and return the decoded lane segments.
/// @param img            input BGR image
/// @param image_bytes    unused here (kept for interface compatibility —
///                       presumably consumed by a caller/JNI path; confirm)
/// @param width, height  original image dimensions used to rescale results
/// @param threshold      centre-confidence threshold
/// @param lens_threshold minimum segment length
std::vector<LaneDetect::Lanes> LaneDetect::detect(const cv::Mat& img, unsigned char* image_bytes, int width, int height, double threshold, double lens_threshold)
{
    // Preprocess: resize to the network input size. cv::resize allocates its
    // own destination, so the full-frame img.clone() the original made was an
    // unnecessary extra copy.
    cv::Mat preImage;
    cv::resize(img, preImage, cv::Size(m_input_size, m_input_size));
    pretreat->convert(preImage.data, m_input_size, m_input_size, 0, m_inTensor);

    // Inference (timed).
    auto t_start_pre = std::chrono::high_resolution_clock::now();
    m_net->runSession(m_session);
    auto t_end_pre = std::chrono::high_resolution_clock::now();
    float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
    LOGD("time1: %f ms", total_pre);

    // Copy the output to a host tensor. Required for fp16 (Precision_Low):
    // reading output->host<float>() directly yields garbage in that mode.
    MNN::Tensor *output = m_net->getSessionOutput(m_session, NULL);
    MNN::Tensor tensor_scores_host(output, output->getDimensionType());
    output->copyToHostTensor(&tensor_scores_host);

    // Decode (timed).
    std::vector<LaneDetect::Lanes> lanes;
    auto t_start_pre2 = std::chrono::high_resolution_clock::now();
    decodeHeatmap(lanes, tensor_scores_host, width, height, threshold, lens_threshold);
    auto t_end_pre2 = std::chrono::high_resolution_clock::now();
    float total_pre2 = std::chrono::duration<float, std::milli>(t_end_pre2 - t_start_pre2).count();
    LOGD("time2: %f ms", total_pre2);
    return lanes;
}
有用的话点个赞哦,亲 :->