How to deal with OpenCV processing that is too slow on the CPU: here I use CUDA programming to run the computation in parallel on the GPU and speed it up.
I. Method 1
Using C++, the single-channel images are first copied from host to device, a CUDA kernel performs the channel merge and the threshold add/subtract, and the kernel output is then copied from device back to host. A C++ caller can wrap the result directly in a cv::Mat; a Python caller does the same through cv2 (as a NumPy array); a C# caller converts to and from Bitmap with the BitmapToGrayByte and ByteToGrayBitmap helpers (the conversion code is attached at the end).
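As a minimal sketch of the C++ calling pattern just described (it assumes the merge_init / mergeToColor / merge_relese exports listed in part 3 below; the file paths, the helper name example_call and the offset values are placeholders), the three channel buffers can be obtained with cv::split and the merged output wrapped back into a cv::Mat:

#include <opencv2/opencv.hpp>
#include <vector>

void example_call()   // illustrative helper name
{
    cv::Mat color = cv::imread("input.png", cv::IMREAD_COLOR);    // placeholder path
    std::vector<cv::Mat> ch;
    cv::split(color, ch);                                  // ch[0]=B, ch[1]=G, ch[2]=R, single channel each
    std::vector<uchar> out(3 * color.cols * color.rows);
    merge_init(color.cols, color.rows);
    // Offsets 20, 0, -20 are just example values
    mergeToColor(ch[0].data, ch[1].data, ch[2].data, color.cols, color.rows, 20, 0, -20, out.data());
    merge_relese();
    // Wrap the interleaved BGR output; note the Mat does not own the buffer
    cv::Mat merged(color.rows, color.cols, CV_8UC3, out.data());
    cv::imwrite("merged.png", merged);                     // placeholder path
}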
1. CUDA kernel
__global__ void merge_image(uchar* src1, uchar* src2, uchar* src3, uchar* drc, int rows, int clos, int value_b, int value_g, int value_r)
{
    // blockDim.x = threads per block, blockIdx.x = block index within the grid,
    // threadIdx.x = thread index within the block (all 1-D here).
    // Each thread handles one pixel of the single-channel inputs.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < clos * rows)
    {
        // Blue channel: saturating add of value_b, clamped to [0, 255]
        if (value_b > 0 && (255 - src1[id]) < value_b)
            drc[id * 3] = 255;
        else if (value_b < 0 && src1[id] <= (-1 * value_b))
            drc[id * 3] = 0;
        else
            drc[id * 3] = src1[id] + value_b;
        // Green channel
        if (value_g > 0 && (255 - src2[id]) < value_g)
            drc[id * 3 + 1] = 255;
        else if (value_g < 0 && src2[id] <= (-1 * value_g))
            drc[id * 3 + 1] = 0;
        else
            drc[id * 3 + 1] = src2[id] + value_g;
        // Red channel
        if (value_r > 0 && (255 - src3[id]) < value_r)
            drc[id * 3 + 2] = 255;
        else if (value_r < 0 && src3[id] <= (-1 * value_r))
            drc[id * 3 + 2] = 0;
        else
            drc[id * 3 + 2] = src3[id] + value_r;
    }
}
uchar* merge_cuda_process(uchar* src1, uchar* src2, uchar* src3, uchar* drc, int rows, int clos, int value_b, int value_g, int value_r) {
    // block = threads per block; grid = number of blocks; total threads = grid * block.
    // Use a fixed block size of 512 instead of "rows": a block can hold at most 1024 threads
    // (this is the fix mentioned in the results section below).
    int block = 512;
    int grid = (clos * rows + block - 1) / block;
    merge_image << <grid, block >> > (src1, src2, src3, drc, rows, clos, value_b, value_g, value_r);
    return drc;
}
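For reference, what the kernel computes is a per-channel saturating offset followed by a channel merge. A CPU equivalent written with OpenCV (a sketch for verifying the GPU output, not part of the accelerated path; the function name merge_cpu_reference is my own) would look like this:

#include <opencv2/opencv.hpp>
#include <vector>

// CPU reference: merge the three single-channel images and apply a saturating per-channel offset.
// b, g, r and the offsets are assumed to match the kernel inputs.
cv::Mat merge_cpu_reference(const cv::Mat& b, const cv::Mat& g, const cv::Mat& r,
                            int value_b, int value_g, int value_r)
{
    std::vector<cv::Mat> channels = { b, g, r };
    cv::Mat bgr;
    cv::merge(channels, bgr);                                   // interleave to CV_8UC3
    cv::add(bgr, cv::Scalar(value_b, value_g, value_r), bgr);   // saturate_cast clamps to [0, 255]
    return bgr;
}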
2. C++ Infer class
int Infer::merge_init(int width, int height)
{
    // Allocate device memory for the B, G and R input channels
    cudaMalloc((void**)&inputdatab, sizeof(uchar) * (width * height));
    cudaMalloc((void**)&inputdatag, sizeof(uchar) * (width * height));
    cudaMalloc((void**)&inputdatar, sizeof(uchar) * (width * height));
    // Allocate device memory for the interleaved BGR output
    cudaMalloc((void**)&outputdata, sizeof(uchar) * (3 * width * height));
    // Allocate host memory for the output image
    output_image_data = new uchar[3 * height * width];
    return 1;
}
uchar* Infer::merge_run(uchar* data_b, uchar* data_g, uchar* data_r, int width, int height, int value_b, int value_g, int value_r)
{
    double start_proess12 = omp_get_wtime();   // requires <omp.h>
    // Copy the three input channels from host to device
    cudaMemcpy(inputdatab, data_b, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    cudaMemcpy(inputdatag, data_g, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    cudaMemcpy(inputdatar, data_r, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    // Launch the CUDA kernel
    merge_cuda_process(inputdatab, inputdatag, inputdatar, outputdata, height, width, value_b, value_g, value_r);
    // Copy the merged BGR image from device back to host
    cudaMemcpy(output_image_data, outputdata, (3 * height * width) * sizeof(uchar), cudaMemcpyDeviceToHost);
    double end_proess12 = omp_get_wtime();
    std::cout << "*********************\n";
    std::cout << "Merge time: " << end_proess12 - start_proess12 << " s\n";
    std::cout << "*********************\n";
    return output_image_data;
}
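The function above does not check the CUDA API return codes; if a copy fails or the kernel launch is invalid (for example, too many threads per block), the error passes silently. A minimal sketch of how the calls could be checked (the CHECK_CUDA macro name is my own; the CUDA calls themselves are standard runtime API):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative macro name; wraps any call that returns cudaError_t.
#define CHECK_CUDA(call)                                                        \
    do {                                                                        \
        cudaError_t err = (call);                                               \
        if (err != cudaSuccess)                                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                         \
                    cudaGetErrorString(err), __FILE__, __LINE__);               \
    } while (0)

// Example usage inside merge_run:
// CHECK_CUDA(cudaMemcpy(inputdatab, data_b, sizeof(uchar) * width * height, cudaMemcpyHostToDevice));
// merge_cuda_process(...);
// CHECK_CUDA(cudaGetLastError());        // reports kernel launch errors
// CHECK_CUDA(cudaDeviceSynchronize());   // reports errors raised while the kernel runs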
int Infer::merge_release()
{
    cudaFree(inputdatab);
    cudaFree(inputdatag);
    cudaFree(inputdatar);
    cudaFree(outputdata);
    delete[] output_image_data;   // allocated with new[], so use delete[]
    return 1;
}
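For completeness, the member pointers used by the three functions above are assumed to be declared in the class roughly like this (a sketch; only the members that actually appear in the snippets are listed):

#include <opencv2/core.hpp>   // provides the uchar typedef

class Infer
{
public:
    int merge_init(int width, int height);
    uchar* merge_run(uchar* data_b, uchar* data_g, uchar* data_r, int width, int height,
                     int value_b, int value_g, int value_r);
    int merge_release();
private:
    uchar* inputdatab = nullptr;        // device buffer, B channel
    uchar* inputdatag = nullptr;        // device buffer, G channel
    uchar* inputdatar = nullptr;        // device buffer, R channel
    uchar* outputdata = nullptr;        // device buffer, interleaved BGR output
    uchar* output_image_data = nullptr; // host buffer returned by merge_run
};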
3. C++ main / DLL exports
Infer infer;
extern "C" __declspec(dllexport) void merge_init(unsigned int width, unsigned int height)
{
    infer.merge_init(width, height);
}
// The offsets can be negative, so they are declared as plain int
extern "C" __declspec(dllexport) void mergeToColor(unsigned char* data_b, unsigned char* data_g, unsigned char* data_r, unsigned int width, unsigned int height,
    int value_b, int value_g, int value_r, uchar* data_)
{
    try
    {
        uchar* output_image_data = infer.merge_run(data_b, data_g, data_r, width, height, value_b, value_g, value_r);
        std::copy(output_image_data, output_image_data + 3 * width * height, data_);
    }
    catch (const std::exception& ex)
    {
        std::cout << ex.what() << "\n" << std::endl;
        // On failure, fill the caller's buffer with zeros instead of reassigning the local pointer
        std::fill(data_, data_ + 3 * width * height, 0);
    }
}
extern "C" __declspec(dllexport) void merge_relese()
{
    infer.merge_release();
}
int main()
{
    std::string path = R"(D:\C++Test\CudaRuntime1\2560_1024_1.bmp)";
    cv::Mat image1 = cv::imread(path, cv::IMREAD_GRAYSCALE);
    uchar* data2 = new uchar[3 * image1.cols * image1.rows];
    merge_init(image1.cols, image1.rows);
    // Run the merge repeatedly to measure a steady-state time
    for (int i = 0; i < 10; i++)
        mergeToColor(image1.data, image1.data, image1.data, image1.cols, image1.rows, 20, 255, -255, data2);
    merge_relese();
    cv::Mat img12 = cv::Mat(cv::Size(image1.cols, image1.rows), CV_8UC3, data2);
    cv::imwrite("D:\\1112.png", img12);
    cv::namedWindow("image", cv::WINDOW_NORMAL);
    cv::imshow("image", img12);
    cv::waitKey();
    cv::destroyAllWindows();
    delete[] data2;
    return 0;
}
II. Performance results
The test machine initially ran out of memory (now resolved: the maximum number of threads per block had not been limited, so the block size in the CUDA launch was changed to a fixed 512; with that fix, merging a 4096*13440 image takes about 40 ms), so smaller images were used for testing. A 2560*1024 image takes about 3 ms, which extrapolates to roughly 18 ms for 4096*3840, much faster than the previous 60 ms.
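The times above were measured with omp_get_wtime() around the whole copy + kernel + copy sequence. If a more precise GPU-side measurement of just the kernel is wanted, CUDA events can be used; a sketch, assuming the device buffers from merge_init have already been filled:

#include <cuda_runtime.h>
#include <iostream>

// Time only the kernel launch with CUDA events (elapsed time is reported in milliseconds)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
merge_cuda_process(inputdatab, inputdatag, inputdatar, outputdata, height, width, value_b, value_g, value_r);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "kernel time: " << ms << " ms\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);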
III. C# conversion methods BitmapToGrayByte and ByteToGrayBitmap
/// <summary>
/// Convert a Bitmap to a byte array
/// </summary>
public static byte[] BitmapToGrayByte(Bitmap bitmap)
{
    BitmapData bitmapData = bitmap.LockBits(new Rectangle(0, 0, bitmap.Width, bitmap.Height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
    IntPtr intPtr = bitmapData.Scan0;
    // Note: this assumes Stride == 3 * Width, i.e. no row padding
    byte[] image = new byte[3 * bitmap.Width * bitmap.Height];
    Marshal.Copy(intPtr, image, 0, 3 * bitmap.Width * bitmap.Height);
    bitmap.UnlockBits(bitmapData);
    return image;
}
/// <summary>
/// Convert a byte array to an 8-bit grayscale Bitmap
/// </summary>
public static Bitmap ByteToGrayBitmap(byte[] rawValues, int width, int height)
{
    // Create the target bitmap and lock its pixel memory
    Bitmap bmp = new Bitmap(width, height, PixelFormat.Format8bppIndexed);
    // Use a grayscale palette so the indexed image displays correctly
    ColorPalette palette = bmp.Palette;
    for (int i = 0; i < 256; i++)
        palette.Entries[i] = Color.FromArgb(i, i, i);
    bmp.Palette = palette;
    BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height),
        ImageLockMode.WriteOnly, PixelFormat.Format8bppIndexed);
    // Image parameters
    int stride = bmpData.Stride;      // width of one scan line, including padding
    int offset = stride - width;      // gap between the visible width and the scan line width
    IntPtr iptr = bmpData.Scan0;      // start of the locked memory
    int scanBytes = stride * height;  // total size of the locked memory
    // Copy the tightly packed source array into a stride-aligned array
    int posScan = 0, posReal = 0;     // positions in the destination and source arrays
    byte[] pixelValues = new byte[scanBytes];
    for (int x = 0; x < height; x++)
    {
        // Copy one scan line
        for (int y = 0; y < width; y++)
        {
            pixelValues[posScan++] = rawValues[posReal++];
        }
        posScan += offset;            // skip the row padding in the destination
    }
    // Copy the stride-aligned array into the BitmapData memory
    Marshal.Copy(pixelValues, 0, iptr, scanBytes);
    // Unlock the bitmap memory
    bmp.UnlockBits(bmpData);
    return bmp;
}
For the image-fusion debugging and algorithm platform development, see: 图像融合(一)__融合效果调试算法平台开发-优快云博客
For handling threshold overflow in image fusion, see: 图像融合(二)__解决阈值溢出问题-优快云博客