手写cuda算子：融合算子Fuse_Resize_Pad_Bgr2rgb实现超50-2000倍加速CPU推理（四）

本文链接：https://blog.youkuaiyun.com/weixin_55083979/article/details/146206653

一.算子原理

二.dynamicbatch infer动态batch推理

三.Fuse_Resize_Pad_Bgr2rgb算子实现与launch函数实现

四.floatptr2cvimg将多batch浮点数指针，转成UINT8格式本地存储

五.OpenCV-Resize_Pad_Bgr2rgb算子实现(待实验)

六.CPU-Fuse_Resize_Pad_Bgr2rgb算子实现(待实验）

七.结果图像对比

八.latency时延对比

一.算子原理

1.仿射变换与仿射变换矩阵

仿射变换矩阵： matrix_src2dst，两行三列；scale是缩放比例，rotation是旋转角度，offset是偏移值；

scale = min(dst_w/src_w, dst_h/src_h);
rotation = 0;
offset_x = 0.5*(dst_w-src_w*scale+scale-1);
offset_y = 0.5*(dst_h-src_h*scale+scale-1);

2.仿射变换逆矩阵: matrix_dst2src；通过求逆矩阵得到

// Record the source image size taken from the loaded cv::Mat.
param.src_h = img.rows;
param.src_w = img.cols;
// Uniform scale factor: the smaller of the height ratio and the width ratio.
float scale = std::min(param.dst_h/(float)param.src_h, param.dst_w/float(param.src_w));
// Translation terms of the src->dst transform (same formula as given in the text above).
float offset_x = 0.5*(param.dst_w-param.src_w*scale+scale-1);
float offset_y = 0.5*(param.dst_h-param.src_h*scale+scale-1);
// 2x3 src->dst matrix: [scale 0 offset_x; 0 scale offset_y].
src2dstmat = (cv::Mat_<float>(2,3,CV_32FC1)<<scale,0,offset_x,0,scale,offset_y);
// Invert it to obtain the dst->src matrix.
cv::invertAffineTransform(src2dstmat,dst2srcmat);
// Copy the six coefficients into matrix_dst2src: v0..v2 from row 0, v3..v5 from row 1.
matrix_dst2src.v0 = dst2srcmat.ptr<float>(0)[0];
matrix_dst2src.v1 = dst2srcmat.ptr<float>(0)[1];
matrix_dst2src.v2 = dst2srcmat.ptr<float>(0)[2];
matrix_dst2src.v3 = dst2srcmat.ptr<float>(1)[0];
matrix_dst2src.v4 = dst2srcmat.ptr<float>(1)[1];
matrix_dst2src.v5 = dst2srcmat.ptr<float>(1)[2];

3.在cuda核函数里面通过遍历dst每一个位置，通过matrix_dst2src 仿射变换计算该位置在src源图像的位置;此时的位置是一个浮点数结果，需要通过双线性插值计算目标点的值。

双线性插值的数学公式可以表示为：
// Each output channel is the weighted sum of the same channel of the four
// neighbouring pixels v1..v4, rounded down with floorf.
c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]);
c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]);
c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]);

4.权重计算公式

/*
Weight layout (each weight is drawn diagonally opposite the pixel it multiplies):
w4  w3
w2  w1

Neighbour pixel layout:
v1  v2
v3  v4
*/
// hx/hy and lx/ly are the fractional distances computed in the kernel
// (hx = x_high - cal_src_x, lx = cal_src_x - x_low, likewise for y).
float w1 = hy*hx;  // weight applied to v1
float w2 = hy*lx;  // weight applied to v2
float w3 = ly*hx;  // weight applied to v3
float w4 = ly*lx;  // weight applied to v4

5.最邻近四个元素的指针计算

// v1 and v2 lie on row y_low; they are only redirected into the source image
// when that row index is non-negative.
if (y_low>=0)
{
    if (x_low >= 0)
    {
        // Pixel (x_low, y_low) of image dy; 3 bytes per pixel.
        v1 = input_src_device + dy * src_volume + src_w * y_low * 3 + x_low * 3;
    }
    if (x_high < src_w)
    {
        // Pixel (x_high, y_low) of image dy.
        v2 = input_src_device + dy * src_volume + src_w * y_low * 3 + x_high * 3;
    }
}
// v3 and v4 are tied to y_high; they are only used when y_high < src_h.
if (y_high<src_h)
{
    if (x_low >= 0)
    {
        // Pixel (x_low, y_high) of image dy.
        v3 = input_src_device + dy * src_volume + src_w * y_high * 3 + x_low * 3;
    }
    if (x_high<src_w)
    {
        // Pixel (x_high, y_high) of image dy.
        v4 = input_src_device + dy * src_volume + src_w * y_high * 3 + x_high * 3;
    }

}

6.最终结果：超出src原图像尺寸范围的位置用默认的pad_value填充（小tip：这里如果把c2与c0交换位置写入，就实现了bgr2rgb）

// Address of destination pixel (dst_x, dst_y) in image dy: images are dst_volume
// floats apart and each pixel occupies 3 consecutive floats.
float* pdst = resize_dst + dy * dst_volume + dst_w * dst_y * 3 + dst_x * 3 ;
// Channels are stored in the order they were computed (no swap here).
pdst[0] = c0;
pdst[1] = c1;
pdst[2] = c2;

二.dynamicbatch infer动态batch推理

推理前，声明指针-->根据超参数batch分配显存-->将cv::Mat指针数据拷贝到device

// Fill every slot of the device batch buffer with the same host image:
// slot i starts src_w*src_h*3 bytes after slot i-1.
for(int i=0;i<param.batch;++i){
        cudaMemcpy(batch_imgs+i*param.src_w*param.src_h*3,img.data,
        param.src_w*param.src_h*3*sizeof(unsigned char),cudaMemcpyHostToDevice);
    }

三.Fuse_Resize_Pad_Bgr2rgb算子实现与launch函数实现

/**
 * Host-side launcher for resize_padding_device_kernel.
 *
 * Launch layout: blocks are BLOCK_SIZE x BLOCK_SIZE threads. The grid's x dimension
 * covers the dst_w*dst_h destination pixels (flattened) and the grid's y dimension
 * covers the param.batch images, because the kernel uses its x index as the pixel
 * index and its y index as the batch index.
 *
 * @param param             sizes (src_w/src_h, dst_w/dst_h), batch, BLOCK_SIZE and pad_value
 * @param intput_src_device device pointer to the 8-bit, 3-channel source batch
 * @param resize_dst        device pointer to the float, 3-channel destination batch
 * @param dst2src           destination -> source affine coefficients, passed by value
 *
 * The kernel is enqueued on the default stream (nullptr) and this function does not
 * synchronize. NOTE: launch errors are not reported here; callers that need them
 * should query cudaGetLastError() after this call.
 */
void launch_cuda_resize_padding(tools::Param param, unsigned char* intput_src_device,float* resize_dst,const tools::AffineMatrix dst2src)
{
    int batch_size = param.batch;
    float pad_value = param.pad_value;
    int src_w = param.src_w;
    int src_h = param.src_h;
    int src_volume = src_w * src_h * 3;
    int dst_w = param.dst_w;
    int dst_h = param.dst_h;
    int dst_volume = dst_w * dst_h * 3;
    int dst_area = dst_w * dst_h;

    // A grid or block with a zero dimension is an invalid launch configuration,
    // and there is nothing to compute in that case anyway.
    if (batch_size <= 0 || dst_area <= 0 || param.BLOCK_SIZE <= 0)
    {
        return;
    }

    dim3 block_size(param.BLOCK_SIZE,param.BLOCK_SIZE);
    // x: ceil(dst_area / BLOCK_SIZE) blocks over destination pixels.
    // y: ceil(batch / BLOCK_SIZE) blocks over images. The previous expression
    //    (BLOCK_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE always evaluated to 1, so only
    //    the first BLOCK_SIZE images of a batch were ever processed.
    dim3 grid_size((dst_area + param.BLOCK_SIZE -1)/param.BLOCK_SIZE,
        (batch_size + param.BLOCK_SIZE -1)/param.BLOCK_SIZE);

    resize_padding_device_kernel<<<grid_size,block_size, 0, nullptr>>>(intput_src_device,src_volume,src_w,src_h,resize_dst,dst_volume,dst_area,dst_w,dst_h,dst2src,batch_size,pad_value);

}

/**
 * Applies the 2x3 affine coefficients in *dst2src to the integer point (dst_x, dst_y).
 *
 * Row 0 (v0, v1, v2) produces the x coordinate written to *cal_src_x and
 * row 1 (v3, v4, v5) produces the y coordinate written to *cal_src_y.
 */
__device__ void affine_project_device_kernel(tools::AffineMatrix* dst2src,int dst_x,int dst_y, float* cal_src_x,float* cal_src_y)
{
    // x' = v0 * x + v1 * y + v2
    const float projected_x = dst2src->v0 * dst_x + dst2src->v1 * dst_y + dst2src->v2;
    *cal_src_x = projected_x;

    // y' = v3 * x + v4 * y + v5
    const float projected_y = dst2src->v3 * dst_x + dst2src->v4 * dst_y + dst2src->v5;
    *cal_src_y = projected_y;
}

/**
 * Fused resize + constant-padding kernel: every destination pixel is mapped back into
 * the source image through the dst->src affine coefficients and sampled with bilinear
 * interpolation; samples that fall outside the source take pad_value.
 *
 * Thread layout: the x index (blockIdx.x / threadIdx.x) enumerates destination pixels
 * in flattened row-major order (dx = dst_y * dst_w + dst_x) and the y index enumerates
 * the images of the batch. Threads with dx >= dst_area or dy >= batch_size do nothing.
 *
 * Memory layout, as implied by the indexing below: the input holds the images back to
 * back, src_volume bytes apart, row-major with 3 interleaved 8-bit channels per pixel;
 * the output holds the images dst_volume floats apart with 3 interleaved floats per
 * pixel. Channels are written in the order they are read (no channel swap here).
 *
 * @param input_src_device device pointer to the source batch
 * @param src_volume       elements per source image (the launcher passes src_w*src_h*3)
 * @param src_w, src_h     source width / height in pixels
 * @param resize_dst       device pointer to the destination batch
 * @param dst_volume       elements per destination image (the launcher passes dst_w*dst_h*3)
 * @param dst_area         destination pixels per image (dst_w*dst_h)
 * @param dst_w, dst_h     destination width / height in pixels
 * @param dst2src          destination -> source affine coefficients
 * @param batch_size       number of images in the batch
 * @param pad_value        value used for destination pixels that map outside the source
 */
__global__  void resize_padding_device_kernel(unsigned char* input_src_device,int src_volume,int src_w,int src_h,float* resize_dst,int dst_volume,
        int dst_area,int dst_w,int dst_h,tools::AffineMatrix dst2src,int batch_size,float pad_value)
{
    const int dx = blockDim.x * blockIdx.x + threadIdx.x;
    const int dy = blockDim.y * blockIdx.y + threadIdx.y;
    // The grid rarely divides the work evenly: drop the surplus threads.
    if (dx >= dst_area || dy >= batch_size)
    {
        return;
    }

    const int dst_y = dx / dst_w;
    const int dst_x = dx % dst_w;

    // Map the destination pixel (dst_x, dst_y) to a fractional source position.
    float cal_src_x = 0;
    float cal_src_y = 0;
    affine_project_device_kernel(&dst2src,dst_x,dst_y,&cal_src_x,&cal_src_y);

    // The three output channels default to the padding value.
    float c0 = pad_value;
    float c1 = pad_value;
    float c2 = pad_value;

    // Positions in [-1, src_w) x [-1, src_h) still touch at least one candidate
    // neighbour; everything else keeps the padding value.
    const bool outside = cal_src_x<-1 || cal_src_x>=src_w || cal_src_y<-1 || cal_src_y>=src_h;
    if (!outside)
    {
        /*
            Given the range check above:
              x_low  is in [-1, src_w-1], x_high is in [0, src_w]
              y_low  is in [-1, src_h-1], y_high is in [0, src_h]
            so each neighbour index still has to be validated before it is used.
        */
        const int x_low = static_cast<int>(floorf(cal_src_x));
        const int y_low = static_cast<int>(floorf(cal_src_y));
        const int x_high = x_low + 1;
        const int y_high = y_low + 1;

        // Stand-in pixel for neighbours that fall outside the source image.
        const unsigned char const_values[] = {(unsigned char)pad_value,(unsigned char)pad_value,(unsigned char)pad_value};

        // Fractional distances to the low (lx, ly) and high (hx, hy) neighbours.
        const float lx = cal_src_x - x_low;
        const float ly = cal_src_y - y_low;
        const float hx = x_high - cal_src_x;
        const float hy = y_high - cal_src_y;
        /*
            Weights (each drawn diagonally opposite the pixel it multiplies):
                w4  w3
                w2  w1

            Neighbour pixels:
                v1  v2
                v3  v4
        */
        const float w1 = hy*hx;
        const float w2 = hy*lx;
        const float w3 = ly*hx;
        const float w4 = ly*lx;

        const unsigned char* v1 = const_values;
        const unsigned char* v2 = const_values;
        const unsigned char* v3 = const_values;
        const unsigned char* v4 = const_values;

        // Start of image dy. The product is formed in size_t because
        // dy * src_volume exceeds the int range for large batches.
        const unsigned char* src_img = input_src_device + static_cast<size_t>(dy) * src_volume;

        // v1 / v2 live on row y_low.
        if (y_low>=0)
        {
            const unsigned char* row_low = src_img + y_low * src_w * 3;
            if (x_low >= 0)
            {
                v1 = row_low + x_low * 3;
            }
            if (x_high < src_w)
            {
                v2 = row_low + x_high * 3;
            }
        }
        // v3 / v4 live on row y_high.
        if (y_high<src_h)
        {
            const unsigned char* row_high = src_img + y_high * src_w * 3;
            if (x_low >= 0)
            {
                v3 = row_high + x_low * 3;
            }
            if (x_high<src_w)
            {
                v4 = row_high + x_high * 3;
            }
        }

        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]);
    }

    // Destination pixel address, again with a 64-bit image offset.
    float* pdst = resize_dst + static_cast<size_t>(dy) * dst_volume
                + (static_cast<size_t>(dst_y) * dst_w + dst_x) * 3;
    pdst[0] = c0;
    pdst[1] = c1;
    pdst[2] = c2;
}

四.floatptr2cvimg将多batch浮点数指针，转成UINT8格式本地存储

/**
 * Converts a packed batch of float images into a list of 8-bit images.
 *
 * imgptr is read as param.batch consecutive images of dst_h x dst_w pixels with
 * 3 interleaved float channels each (CV_32FC3). Every image is converted with
 * cv::Mat::convertTo to CV_8UC3 and appended to the returned vector.
 *
 * @param param  supplies batch, dst_h and dst_w
 * @param imgptr host pointer to the float batch; it is only read, never owned
 * @return one CV_8UC3 cv::Mat per batch entry (empty if imgptr is null or batch <= 0)
 */
std::vector<cv::Mat> floatptr2cvmat(tools::Param param,float* imgptr){
    std::vector<cv::Mat> img_list;
    if (imgptr == nullptr || param.batch <= 0) {
        return img_list;
    }
    img_list.reserve(param.batch);

    // Elements per image, computed in size_t: i * dst_h * dst_w * 3 overflows
    // a 32-bit int for large batches.
    const size_t img_elems = static_cast<size_t>(param.dst_h) * param.dst_w * 3;
    for(int i=0;i<param.batch;++i){
        float* img_data = imgptr + static_cast<size_t>(i) * img_elems;
        // Header-only view over the caller's buffer; no pixel data is copied here.
        cv::Mat floatcvimg(param.dst_h,param.dst_w,CV_32FC3,img_data);
        // convertTo allocates a fresh buffer for the empty output Mat, so the result
        // already owns its data and does not need an extra clone().
        cv::Mat uint8cvimg;
        floatcvimg.convertTo(uint8cvimg,CV_8UC3);
        img_list.push_back(uint8cvimg);
    }
    return img_list;
}

五.OpenCV-Resize_Pad_Bgr2rgb算子实现(待实验)

// Scratch image for the resized result and the dst_w x dst_h canvas pre-filled with 127.
cv::Mat img_cpu;
cv::Mat img_cpu_gray(cv::Size(param.dst_w,param.dst_h),CV_8UC3,cv::Scalar(127,127,127));
// Size of the scaled image. NOTE: the (int) cast binds to param.src_w only; the
// float product is then truncated by the implicit conversion to int.
int target_w = (int)param.src_w*scale;
int target_h = (int)param.src_h*scale;
// offset_x = (param.dst_h-target_h)/2;
// offset_y = (param.dst_w-target_w)/2;
// Top-left corner that centres the scaled image on the canvas (clamped at 0).
offset_x = std::max(0, (param.dst_w - target_w) / 2);
offset_y = std::max(0, (param.dst_h - target_h) / 2);


// Timed loop: resize, swap BGR -> RGB, then paste into the canvas region.
for(int j=0;j<param.epochs;++j){
    HostTimer ht0; 
    cv::resize(img,img_cpu,cv::Size(target_w,target_h),0,0,cv::INTER_LINEAR);
    cv::cvtColor(img_cpu,img_cpu,cv::COLOR_BGR2RGB);
    img_cpu.copyTo(img_cpu_gray(cv::Rect(offset_x,offset_y,img_cpu.cols,img_cpu.rows)));
    float hc1 = ht0.getUsedTime();
    printf("cpu opencv const %f ,img_width: %d img_height %d\n",hc1,img.cols,img.rows);

}
// Save the canvas produced by the last iteration.
cv::imwrite("cpu_rezizepadbgr.jpg",img_cpu_gray);

六.CPU-Fuse_Resize_Pad_Bgr2rgb算子实现(待实验）

七.结果图像对比

1.cuda 算子结果：bgr格式与rgb格式

2.OpenCV算法实现结果：bgr格式与rgb格式

八.Latency时延对比（batch=10/100/1000/5000）

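1.cuda计算与cv::resizepad的时间对比

注：下面的cuda耗时是一次launch的总耗时除以param.batch得到的单张耗时。batch=100/1000/5000时换算回总耗时几乎相同（约2.9ms），推测这些数据测量时launch配置中grid的y维恒为1（即(BLOCK_SIZE+BLOCK_SIZE-1)/BLOCK_SIZE），核函数只处理了dy<BLOCK_SIZE的图片；因此当batch大于BLOCK_SIZE时，下列单张耗时被低估、加速比被高估，需要按实际处理的图片数重新测量。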

batch = 10

epoch 980 cuda op const 0.172054 ,img_width: 750 img_height 563
epoch 981 cuda op const 0.173248 ,img_width: 750 img_height 563
epoch 982 cuda op const 0.170288 ,img_width: 750 img_height 563
epoch 983 cuda op const 0.166989 ,img_width: 750 img_height 563
epoch 984 cuda op const 0.166275 ,img_width: 750 img_height 563
epoch 985 cuda op const 0.169181 ,img_width: 750 img_height 563
epoch 986 cuda op const 0.171242 ,img_width: 750 img_height 563
epoch 987 cuda op const 0.167664 ,img_width: 750 img_height 563
epoch 988 cuda op const 0.169021 ,img_width: 750 img_height 563
epoch 989 cuda op const 0.172352 ,img_width: 750 img_height 563
epoch 990 cuda op const 0.167555 ,img_width: 750 img_height 563
epoch 991 cuda op const 0.174797 ,img_width: 750 img_height 563
epoch 992 cuda op const 0.171424 ,img_width: 750 img_height 563
epoch 993 cuda op const 0.173725 ,img_width: 750 img_height 563
epoch 994 cuda op const 0.169533 ,img_width: 750 img_height 563
epoch 995 cuda op const 0.167475 ,img_width: 750 img_height 563
epoch 996 cuda op const 0.171782 ,img_width: 750 img_height 563
epoch 997 cuda op const 0.172122 ,img_width: 750 img_height 563
epoch 998 cuda op const 0.194198 ,img_width: 750 img_height 563
epoch 999 cuda op const 0.168038 ,img_width: 750 img_height 563
cpu propcess!!
epoch 0 cpu opencv const 5.732247 ,img_width: 750 img_height 563
epoch 1 cpu opencv const 2.152392 ,img_width: 750 img_height 563
epoch 2 cpu opencv const 2.172201 ,img_width: 750 img_height 563
epoch 3 cpu opencv const 1.556902 ,img_width: 750 img_height 563
epoch 4 cpu opencv const 1.726663 ,img_width: 750 img_height 563
epoch 5 cpu opencv const 1.481510 ,img_width: 750 img_height 563
epoch 6 cpu opencv const 1.440997 ,img_width: 750 img_height 563
epoch 7 cpu opencv const 2.039463 ,img_width: 750 img_height 563
epoch 8 cpu opencv const 1.536422 ,img_width: 750 img_height 563
epoch 9 cpu opencv const 1.607238 ,img_width: 750 img_height 563
epoch 10 cpu opencv const 2.025224 ,img_width: 750 img_height 563
epoch 11 cpu opencv const 1.160740 ,img_width: 750 img_height 563
epoch 12 cpu opencv const 1.112708 ,img_width: 750 img_height 563
epoch 13 cpu opencv const 1.817159 ,img_width: 750 img_height 563
epoch 14 cpu opencv const 1.891463 ,img_width: 750 img_height 563
epoch 15 cpu opencv const 1.704038 ,img_width: 750 img_height 563
epoch 16 cpu opencv const 1.128037 ,img_width: 750 img_height 563
epoch 17 cpu opencv const 1.181764 ,img_width: 750 img_height 563
epoch 18 cpu opencv const 1.511142 ,img_width: 750 img_height 563
epoch 19 cpu opencv const 1.718407 ,img_width: 750 img_height 563
epoch 20 cpu opencv const 1.781127 ,img_width: 750 img_height 563

batch = 100

epoch 980 cuda op const 0.028806 ,img_width: 750 img_height 563
epoch 981 cuda op const 0.028790 ,img_width: 750 img_height 563
epoch 982 cuda op const 0.028797 ,img_width: 750 img_height 563
epoch 983 cuda op const 0.028782 ,img_width: 750 img_height 563
epoch 984 cuda op const 0.028801 ,img_width: 750 img_height 563
epoch 985 cuda op const 0.028824 ,img_width: 750 img_height 563
epoch 986 cuda op const 0.028808 ,img_width: 750 img_height 563
epoch 987 cuda op const 0.028845 ,img_width: 750 img_height 563
epoch 988 cuda op const 0.028838 ,img_width: 750 img_height 563
epoch 989 cuda op const 0.028825 ,img_width: 750 img_height 563
epoch 990 cuda op const 0.028794 ,img_width: 750 img_height 563
epoch 991 cuda op const 0.028817 ,img_width: 750 img_height 563
epoch 992 cuda op const 0.028783 ,img_width: 750 img_height 563
epoch 993 cuda op const 0.028804 ,img_width: 750 img_height 563
epoch 994 cuda op const 0.028774 ,img_width: 750 img_height 563
epoch 995 cuda op const 0.028631 ,img_width: 750 img_height 563
epoch 996 cuda op const 0.028582 ,img_width: 750 img_height 563
epoch 997 cuda op const 0.028596 ,img_width: 750 img_height 563
epoch 998 cuda op const 0.028594 ,img_width: 750 img_height 563
epoch 999 cuda op const 0.028604 ,img_width: 750 img_height 563
cpu propcess!!
epoch 0 cpu opencv const 5.226768 ,img_width: 750 img_height 563
epoch 1 cpu opencv const 3.195691 ,img_width: 750 img_height 563
epoch 2 cpu opencv const 2.124902 ,img_width: 750 img_height 563
epoch 3 cpu opencv const 2.339208 ,img_width: 750 img_height 563
epoch 4 cpu opencv const 1.806246 ,img_width: 750 img_height 563
epoch 5 cpu opencv const 2.067526 ,img_width: 750 img_height 563
epoch 6 cpu opencv const 1.486469 ,img_width: 750 img_height 563
epoch 7 cpu opencv const 1.414405 ,img_width: 750 img_height 563
epoch 8 cpu opencv const 1.417124 ,img_width: 750 img_height 563
epoch 9 cpu opencv const 0.905187 ,img_width: 750 img_height 563
epoch 10 cpu opencv const 1.772198 ,img_width: 750 img_height 563
epoch 11 cpu opencv const 1.603525 ,img_width: 750 img_height 563
epoch 12 cpu opencv const 1.799590 ,img_width: 750 img_height 563
epoch 13 cpu opencv const 1.758662 ,img_width: 750 img_height 563
epoch 14 cpu opencv const 1.768677 ,img_width: 750 img_height 563
epoch 15 cpu opencv const 1.646501 ,img_width: 750 img_height 563
epoch 16 cpu opencv const 2.659240 ,img_width: 750 img_height 563
epoch 17 cpu opencv const 1.101092 ,img_width: 750 img_height 563
epoch 18 cpu opencv const 1.672933 ,img_width: 750 img_height 563
epoch 19 cpu opencv const 1.896550 ,img_width: 750 img_height 563
epoch 20 cpu opencv const 1.555621 ,img_width: 750 img_height 563

batch = 1000

epoch 980 cuda op const 0.002896 ,img_width: 750 img_height 563
epoch 981 cuda op const 0.002896 ,img_width: 750 img_height 563
epoch 982 cuda op const 0.002885 ,img_width: 750 img_height 563
epoch 983 cuda op const 0.003374 ,img_width: 750 img_height 563
epoch 984 cuda op const 0.003342 ,img_width: 750 img_height 563
epoch 985 cuda op const 0.003326 ,img_width: 750 img_height 563
epoch 986 cuda op const 0.003387 ,img_width: 750 img_height 563
epoch 987 cuda op const 0.002899 ,img_width: 750 img_height 563
epoch 988 cuda op const 0.005490 ,img_width: 750 img_height 563
epoch 989 cuda op const 0.003381 ,img_width: 750 img_height 563
epoch 990 cuda op const 0.003330 ,img_width: 750 img_height 563
epoch 991 cuda op const 0.003318 ,img_width: 750 img_height 563
epoch 992 cuda op const 0.003388 ,img_width: 750 img_height 563
epoch 993 cuda op const 0.002888 ,img_width: 750 img_height 563
epoch 994 cuda op const 0.005479 ,img_width: 750 img_height 563
epoch 995 cuda op const 0.003373 ,img_width: 750 img_height 563
epoch 996 cuda op const 0.003340 ,img_width: 750 img_height 563
epoch 997 cuda op const 0.003340 ,img_width: 750 img_height 563
epoch 998 cuda op const 0.003396 ,img_width: 750 img_height 563
epoch 999 cuda op const 0.002887 ,img_width: 750 img_height 563
cpu propcess!!
epoch 0 cpu opencv const 5.238708 ,img_width: 750 img_height 563
epoch 1 cpu opencv const 2.164763 ,img_width: 750 img_height 563
epoch 2 cpu opencv const 2.063035 ,img_width: 750 img_height 563
epoch 3 cpu opencv const 2.486202 ,img_width: 750 img_height 563
epoch 4 cpu opencv const 1.245405 ,img_width: 750 img_height 563
epoch 5 cpu opencv const 1.014045 ,img_width: 750 img_height 563
epoch 6 cpu opencv const 1.394557 ,img_width: 750 img_height 563
epoch 7 cpu opencv const 1.161213 ,img_width: 750 img_height 563
epoch 8 cpu opencv const 1.397469 ,img_width: 750 img_height 563
epoch 9 cpu opencv const 1.737884 ,img_width: 750 img_height 563
epoch 10 cpu opencv const 1.084029 ,img_width: 750 img_height 563
epoch 11 cpu opencv const 1.064254 ,img_width: 750 img_height 563
epoch 12 cpu opencv const 2.396474 ,img_width: 750 img_height 563
epoch 13 cpu opencv const 1.971900 ,img_width: 750 img_height 563
epoch 14 cpu opencv const 1.725276 ,img_width: 750 img_height 563
epoch 15 cpu opencv const 1.567133 ,img_width: 750 img_height 563
epoch 16 cpu opencv const 2.015387 ,img_width: 750 img_height 563
epoch 17 cpu opencv const 1.704092 ,img_width: 750 img_height 563
epoch 18 cpu opencv const 1.677532 ,img_width: 750 img_height 563
epoch 19 cpu opencv const 1.555516 ,img_width: 750 img_height 563
epoch 20 cpu opencv const 1.952060 ,img_width: 750 img_height 563

batch = 5000

epoch 980 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 981 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 982 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 983 cuda op const 0.000573 ,img_width: 750 img_height 563
epoch 984 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 985 cuda op const 0.000577 ,img_width: 750 img_height 563
epoch 986 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 987 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 988 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 989 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 990 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 991 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 992 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 993 cuda op const 0.000576 ,img_width: 750 img_height 563
epoch 994 cuda op const 0.000574 ,img_width: 750 img_height 563
epoch 995 cuda op const 0.000576 ,img_width: 750 img_height 563
epoch 996 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 997 cuda op const 0.000576 ,img_width: 750 img_height 563
epoch 998 cuda op const 0.000575 ,img_width: 750 img_height 563
epoch 999 cuda op const 0.000575 ,img_width: 750 img_height 563
cpu propcess!!
epoch 0 cpu opencv const 5.225226 ,img_width: 750 img_height 563
epoch 1 cpu opencv const 5.118987 ,img_width: 750 img_height 563
epoch 2 cpu opencv const 1.679843 ,img_width: 750 img_height 563
epoch 3 cpu opencv const 1.647299 ,img_width: 750 img_height 563
epoch 4 cpu opencv const 1.635460 ,img_width: 750 img_height 563
epoch 5 cpu opencv const 1.637443 ,img_width: 750 img_height 563
epoch 6 cpu opencv const 1.632451 ,img_width: 750 img_height 563
epoch 7 cpu opencv const 1.624227 ,img_width: 750 img_height 563
epoch 8 cpu opencv const 1.534532 ,img_width: 750 img_height 563
epoch 9 cpu opencv const 1.769283 ,img_width: 750 img_height 563
epoch 10 cpu opencv const 1.611299 ,img_width: 750 img_height 563
epoch 11 cpu opencv const 1.604932 ,img_width: 750 img_height 563
epoch 12 cpu opencv const 1.612163 ,img_width: 750 img_height 563
epoch 13 cpu opencv const 1.655651 ,img_width: 750 img_height 563
epoch 14 cpu opencv const 1.643011 ,img_width: 750 img_height 563
epoch 15 cpu opencv const 1.689508 ,img_width: 750 img_height 563
epoch 16 cpu opencv const 1.681379 ,img_width: 750 img_height 563
epoch 17 cpu opencv const 1.629092 ,img_width: 750 img_height 563
epoch 18 cpu opencv const 2.414660 ,img_width: 750 img_height 563
epoch 19 cpu opencv const 1.696036 ,img_width: 750 img_height 563
epoch 20 cpu opencv const 1.669123 ,img_width: 750 img_height 563

2.cuda计算与cv::resizepad_bgr2rgb的时间对比

九.test_Resize_Pad_Bgr2rgb.cpp源码

#include "src/utils/utils.h"
#include "src/resizepad/resizepad.h"


/**
 * Converts a packed batch of float images into a list of 8-bit images.
 *
 * imgptr is read as param.batch consecutive images of dst_h x dst_w pixels with
 * 3 interleaved float channels each (CV_32FC3). Every image is converted with
 * cv::Mat::convertTo to CV_8UC3 and appended to the returned vector.
 *
 * @param param  supplies batch, dst_h and dst_w
 * @param imgptr host pointer to the float batch; it is only read, never owned
 * @return one CV_8UC3 cv::Mat per batch entry (empty if imgptr is null or batch <= 0)
 */
std::vector<cv::Mat> floatptr2cvmat(tools::Param param,float* imgptr){
    std::vector<cv::Mat> img_list;
    if (imgptr == nullptr || param.batch <= 0) {
        return img_list;
    }
    img_list.reserve(param.batch);

    // Elements per image, computed in size_t: i * dst_h * dst_w * 3 overflows
    // a 32-bit int for large batches.
    const size_t img_elems = static_cast<size_t>(param.dst_h) * param.dst_w * 3;
    for(int i=0;i<param.batch;++i){
        float* img_data = imgptr + static_cast<size_t>(i) * img_elems;
        // Header-only view over the caller's buffer; no pixel data is copied here.
        cv::Mat floatcvimg(param.dst_h,param.dst_w,CV_32FC3,img_data);
        // convertTo allocates a fresh buffer for the empty output Mat, so the result
        // already owns its data and does not need an extra clone().
        cv::Mat uint8cvimg;
        floatcvimg.convertTo(uint8cvimg,CV_8UC3);
        img_list.push_back(uint8cvimg);
    }
    return img_list;
}


int main(){
    tools::Param  param;
    tools::AffineMatrix matrix_src2dst;
    tools::AffineMatrix matrix_dst2src;
    cv::Mat src2dstmat;
    cv::Mat dst2srcmat;
    cv::Mat img = cv::imread("../a.jpg");
    param.src_h = img.rows;
    param.src_w = img.cols;
    float scale = std::min(param.dst_h/(float)param.src_h, param.dst_w/float(param.src_w));
    float offset_x = 0.5*(param.dst_w-param.src_w*scale+scale-1);
    float offset_y = 0.5*(param.dst_h-param.src_h*scale+scale-1);
    src2dstmat = (cv::Mat_<float>(2,3,CV_32FC1)<<scale,0,offset_x,0,scale,offset_y);
    cv::invertAffineTransform(src2dstmat,dst2srcmat);
    matrix_dst2src.v0 = dst2srcmat.ptr<float>(0)[0];
    matrix_dst2src.v1 = dst2srcmat.ptr<float>(0)[1];
    matrix_dst2src.v2 = dst2srcmat.ptr<float>(0)[2];
    matrix_dst2src.v3 = dst2srcmat.ptr<float>(1)[0];
    matrix_dst2src.v4 = dst2srcmat.ptr<float>(1)[1];
    matrix_dst2src.v5 = dst2srcmat.ptr<float>(1)[2];
    
    unsigned char* batch_imgs;
    float* batch_imgs_resized_host;
    float* batch_imgs_resized_device;
    // 将多个cv::Mat host指针数据形成dynamicbatch
    cudaMalloc((void**)&batch_imgs,param.batch*param.src_w*param.src_h*3*sizeof(unsigned char));
    // 为cuda核函数计算结果在host端展示的空间分配内存
    batch_imgs_resized_host = (float*)malloc(param.batch*param.dst_h*param.dst_w*3*sizeof(float));
    // 为cuda核函数计算结果在device端的空间分配内存
    cudaMalloc((void**)&batch_imgs_resized_device,param.batch*param.dst_h*param.dst_w*3*sizeof(float));
    // 将cv::Mat host指针数据拷贝到device
    for(int i=0;i<param.batch;++i){
        cudaMemcpy(batch_imgs+i*param.src_w*param.src_h*3,img.data,
        param.src_w*param.src_h*3*sizeof(unsigned char),cudaMemcpyHostToDevice);
    }
    for(int i=0;i<param.epochs;++i){
        DeviceTimer dt0; 
        launch_cuda_resize_padding(param,batch_imgs,batch_imgs_resized_device,matrix_dst2src);
        float dc0 = dt0.getUsedTime()/param.batch;
        printf("epoch %d cuda op const %f ,img_width: %d img_height %d\n",i,dc0,img.cols,img.rows);
    }
    printf("cpu propcess!!");
    cv::Mat img_cpu;
    cv::Mat img_cpu_gray(cv::Size(param.dst_w,param.dst_h),CV_8UC3,cv::Scalar(127,127,127));
    int target_w = (int)param.src_w*scale;
    int target_h = (int)param.src_h*scale;
    // offset_x = (param.dst_h-target_h)/2;
    // offset_y = (param.dst_w-target_w)/2;
    offset_x = std::max(0, (param.dst_w - target_w) / 2);
    offset_y = std::max(0, (param.dst_h - target_h) / 2);


    for(int j=0;j<param.epochs;++j){
        HostTimer ht0; 
        cv::resize(img,img_cpu,cv::Size(target_w,target_h),0,0,cv::INTER_LINEAR);
        cv::cvtColor(img_cpu,img_cpu,cv::COLOR_BGR2RGB);
        img_cpu.copyTo(img_cpu_gray(cv::Rect(offset_x,offset_y,img_cpu.cols,img_cpu.rows)));
        float hc1 = ht0.getUsedTime();
        printf("epoch %d cpu opencv const %f ,img_width: %d img_height %d\n",j,hc1,img.cols,img.rows);

    }
    cv::imwrite("cpu_rezizepadbgr.jpg",img_cpu_gray);
    
    cudaMemcpy(batch_imgs_resized_host,batch_imgs_resized_device,
        param.batch*param.dst_h*param.dst_w*3*sizeof(float),cudaMemcpyDeviceToHost);

    std::vector<cv::Mat> img_list = floatptr2cvmat(param,batch_imgs_resized_host);
    for(int j=0;j<img_list.size();++j){
        std::string img_name = std::to_string(j)+".jpg";
        cv::imwrite(img_name,img_list[j]);  
    }
    free(batch_imgs_resized_host);
    cudaFree(batch_imgs);
    cudaFree(batch_imgs_resized_device);
    
}