安卓MNN半精度记录
- 默认编译生成的so库,没有开启arm82。若要使用fp16计算,需要修改cmakelists.txt 151行
option(MNN_ARM82 "Enable ARM82" OFF)
改为ON
- MNNConvert --fp16,转出来的模型只是缩减了模型的存储大小,与推理时的计算精度无关。
- 我用较大的模型fp32跑延时大概100ms,fp16跑大概50ms,速度可以提升一半;小模型的话可能只提升几ms;手机需要支持arm82才能开启fp16计算。
- 代码中的fp16配置,只需要设置
m_backend_config.precision = MNN::BackendConfig::PrecisionMode::Precision_Low;
即可。这里有三种精度模式Precision_High
,Precision_Normal
,Precision_Low
;默认用 low,速度最快;设备支持 fp16 计算并且 so 库开启了 arm82 的话,实时性最好;normal 对应 int8 计算;high 对应 fp32 计算;若使用 low 模式而设备不支持 fp16 计算,会自动切换成 fp32。
- 不需要用 torch 混合精度训练后再转换模型,只需正常训练并转换模型即可。
- 若使用high结果正常,使用low结果完全就是错的。你可能直接这么获取结果
tensor->host
这对于fp16来说是错误的,正确获取结果的方式为:
MNN::Tensor *output= m_net->getSessionOutput(m_session, NULL);
MNN::Tensor tensor_scores_host(output, output->getDimensionType());// 得到结果指针
output->copyToHostTensor(&tensor_scores_host);
std::vector<LaneDetect::Lanes> lanes;
decodeHeatmap(lanes,tensor_scores_host, width, height, threshold,lens_threshold);
函数:
void LaneDetect::decodeHeatmap(std::vector<LaneDetect::Lanes>& lanes,const MNN::Tensor& heatmap,int w, int h, double threshold, double lens_threshold)
{
const float* displacement = heatmap.host<float>()+m_hm_size*m_hm_size;
const float* hm = heatmap.host<float>();
...
}
以下是我工程的完整代码:
#include "lane.hpp"
// Whether the device has a usable GPU — presumably set by the caller/JNI layer
// before construction; TODO confirm (it is never assigned in this file).
bool LaneDetect::hasGPU = false;
// Effective GPU flag computed in the constructor: hasGPU && requested useGPU.
bool LaneDetect::toUseGPU = false;
// Singleton-style instance pointer, presumably managed by the JNI wrapper.
LaneDetect *LaneDetect::detector = nullptr;
/// Construct the detector: load the MNN model, configure the backend
/// (fp16 via Precision_Low when the device/so supports ARM82), set up the
/// image pre-processor, and create/resize the inference session.
/// @param mnn_path path to the .mnn model file
/// @param useGPU   request the OpenCL backend (honored only if hasGPU is set)
LaneDetect::LaneDetect(const std::string &mnn_path, bool useGPU)
{
    // Effective GPU flag: request AND availability.
    toUseGPU = hasGPU && useGPU;
    m_net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(mnn_path.c_str()));
    // Precision_Low enables fp16 compute on ARM82-capable devices; MNN falls
    // back to fp32 automatically when fp16 is unsupported.
    m_backend_config.precision = MNN::BackendConfig::PrecisionMode::Precision_Low;
    m_backend_config.power = MNN::BackendConfig::Power_Normal;    // power profile
    m_backend_config.memory = MNN::BackendConfig::Memory_Normal;  // memory profile
    m_config.backendConfig = &m_backend_config;
    m_config.numThread = 2;
    // Fix: test the effective flag, not the raw request — previously a GPU
    // request on a GPU-less device selected OpenCL and relied on the backup
    // type to fall back; now we pick the CPU path directly in that case.
    if (toUseGPU) {
        m_config.type = MNN_FORWARD_OPENCL;
    }
    m_config.backupType = MNN_FORWARD_CPU;

    // Image pre-processing: (img - mean) * norm, BGR -> RGB.
    MNN::CV::ImageProcess::Config img_config;
    ::memcpy(img_config.mean, m_mean_vals, sizeof(m_mean_vals));
    ::memcpy(img_config.normal, m_norm_vals, sizeof(m_norm_vals));
    img_config.sourceFormat = MNN::CV::BGR;
    img_config.destFormat = MNN::CV::RGB;
    pretreat = std::shared_ptr<MNN::CV::ImageProcess>(MNN::CV::ImageProcess::create(img_config));
    MNN::CV::Matrix trans;
    trans.setScale(1.0f, 1.0f); // identity scale: input is resized beforehand
    pretreat->setMatrix(trans);

    m_session = m_net->createSession(m_config); // create the inference session
    m_inTensor = m_net->getSessionInput(m_session, NULL);
    m_net->resizeTensor(m_inTensor, {1, 3, m_input_size, m_input_size});
    m_net->resizeSession(m_session);
    std::cout << "session created" << std::endl;

    // Pre-build the index table [0, hm*hm) once; decodeHeatmap copies and
    // sorts it per frame instead of re-filling it.
    m_index.resize(m_hm_size * m_hm_size);
    for (int i = 0; i != m_hm_size * m_hm_size; i++) {
        m_index[i] = i;
    }
}
// Destructor: release the model buffer and tear down the inference session.
// NOTE(review): releaseModel before releaseSession follows the original code;
// MNN permits freeing the model buffer once sessions are created — confirm
// against the MNN Interpreter docs if the order is ever changed.
LaneDetect::~LaneDetect()
{
m_net->releaseModel();
m_net->releaseSession(m_session);
}
/// Clamp a (scaled) coordinate into the valid pixel range.
/// Values in (0, m_input_size) are truncated to int; out-of-range values are
/// pinned to 1 on the low side (matching the original's choice of 1, not 0)
/// and m_input_size - 1 on the high side.
/// Bug fix: value == 0 previously matched neither `> 0` nor `< 0` and fell
/// through to the high-side clamp, returning m_input_size - 1.
inline int LaneDetect::clip(float value)
{
    if (value <= 0)
        return 1;
    if (value >= m_input_size)
        return m_input_size - 1;
    return int(value);
}
// old
/// Decode the network output into line segments.
/// Host tensor layout: a centre score map of m_hm_size*m_hm_size floats,
/// followed by 4 displacement planes of the same size (x1, y1, x2, y2 offsets).
/// @param lanes          output: decoded segments, scaled to (w, h)
/// @param heatmap        host-side copy of the output tensor (fp16-safe)
/// @param w, h           original image width/height for rescaling
/// @param threshold      minimum centre confidence to keep a segment
/// @param lens_threshold minimum segment length (in heat-map units) to keep
void LaneDetect::decodeHeatmap(std::vector<LaneDetect::Lanes>& lanes, const MNN::Tensor& heatmap, int w, int h, double threshold, double lens_threshold)
{
    const int plane = m_hm_size * m_hm_size;
    // Centre scores (plane floats), then 4 displacement planes.
    const float* hm = heatmap.host<float>();
    const float* displacement = hm + plane;

    // Copy the scores so sorting indices can reference a stable buffer.
    // (mlsd.mnn originally required sigmoid: 1/(exp(-hm[i]) + 1); this model
    // emits scores directly.)
    std::vector<float> center(plane);
    // Fix: the original line was corrupted to "¢er[0]" (HTML-entity
    // mangling of "&center[0]") and did not compile.
    memcpy(center.data(), hm, plane * sizeof(float));

    // Sort a copy of the precomputed index table by descending score.
    std::vector<int> index = m_index;
    sort(index.begin(), index.end(),
         [&](const int& a, const int& b) {
             return center[a] > center[b];
         });

    for (int i = 0; i < plane; i++)
    {
        int yy = index[i] / m_hm_size; // row
        int xx = index[i] % m_hm_size; // column
        Lanes Lane;
        // Endpoints = grid cell + per-cell displacement from the 4 planes.
        Lane.x1 = xx + displacement[index[i] + 0 * plane];
        Lane.y1 = yy + displacement[index[i] + 1 * plane];
        Lane.x2 = xx + displacement[index[i] + 2 * plane];
        Lane.y2 = yy + displacement[index[i] + 3 * plane];
        Lane.lens = sqrt(pow(Lane.x1 - Lane.x2, 2) + pow(Lane.y1 - Lane.y2, 2));
        Lane.conf = center[index[i]];
        if (center[index[i]] > threshold && lanes.size() < m_top_k)
        {
            if (Lane.lens > lens_threshold)
            {
                // Heat map is at half the network input resolution; rescale
                // to the original image size and clamp into bounds.
                Lane.x1 = clip(w * Lane.x1 / (m_input_size / 2));
                Lane.x2 = clip(w * Lane.x2 / (m_input_size / 2));
                Lane.y1 = clip(h * Lane.y1 / (m_input_size / 2));
                Lane.y2 = clip(h * Lane.y2 / (m_input_size / 2));
                lanes.push_back(Lane);
            }
        }
        else
            // Scores are sorted descending, so nothing further can pass.
            break;
    }
}
/// Run one inference on an image and return the decoded lane segments.
/// @param img            input BGR image
/// @param image_bytes    unused here (kept for interface compatibility —
///                       presumably consumed by a caller/JNI path; confirm)
/// @param width, height  original image dimensions used to rescale results
/// @param threshold      centre-confidence threshold
/// @param lens_threshold minimum segment length
std::vector<LaneDetect::Lanes> LaneDetect::detect(const cv::Mat& img, unsigned char* image_bytes, int width, int height, double threshold, double lens_threshold)
{
    // Preprocess: resize to the network input size. cv::resize allocates its
    // own destination, so the full-frame img.clone() the original made was an
    // unnecessary extra copy.
    cv::Mat preImage;
    cv::resize(img, preImage, cv::Size(m_input_size, m_input_size));
    pretreat->convert(preImage.data, m_input_size, m_input_size, 0, m_inTensor);

    // Inference (timed).
    auto t_start_pre = std::chrono::high_resolution_clock::now();
    m_net->runSession(m_session);
    auto t_end_pre = std::chrono::high_resolution_clock::now();
    float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
    LOGD("time1: %f ms", total_pre);

    // Copy the output to a host tensor. Required for fp16 (Precision_Low):
    // reading output->host<float>() directly yields garbage in that mode.
    MNN::Tensor *output = m_net->getSessionOutput(m_session, NULL);
    MNN::Tensor tensor_scores_host(output, output->getDimensionType());
    output->copyToHostTensor(&tensor_scores_host);

    // Decode (timed).
    std::vector<LaneDetect::Lanes> lanes;
    auto t_start_pre2 = std::chrono::high_resolution_clock::now();
    decodeHeatmap(lanes, tensor_scores_host, width, height, threshold, lens_threshold);
    auto t_end_pre2 = std::chrono::high_resolution_clock::now();
    float total_pre2 = std::chrono::duration<float, std::milli>(t_end_pre2 - t_start_pre2).count();
    LOGD("time2: %f ms", total_pre2);
    return lanes;
}
有用的话点个赞哦,亲 :->