How to deal with OpenCV processing that is too slow on the CPU: here I use CUDA programming to run the computation in parallel on the GPU and speed it up.
I. Method 1
Using C++, the single-channel images are first copied from host to device, a CUDA kernel performs the channel merge and the threshold add/subtract, and the kernel output is then copied from device back to host. A C++ caller can wrap the result directly in a cv::Mat; a Python caller does the same through cv2 (as a NumPy array); a C# caller converts to and from Bitmap with the BitmapToGrayByte and ByteToGrayBitmap helpers (the conversion code is attached at the end).
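As a minimal sketch of the C++ calling pattern just described (it assumes the merge_init / mergeToColor / merge_relese exports listed in part 3 below; the file paths, the helper name example_call and the offset values are placeholders), the three channel buffers can be obtained with cv::split and the merged output wrapped back into a cv::Mat:

#include <opencv2/opencv.hpp>
#include <vector>

void example_call()   // illustrative helper name
{
    cv::Mat color = cv::imread("input.png", cv::IMREAD_COLOR);    // placeholder path
    std::vector<cv::Mat> ch;
    cv::split(color, ch);                                  // ch[0]=B, ch[1]=G, ch[2]=R, single channel each
    std::vector<uchar> out(3 * color.cols * color.rows);
    merge_init(color.cols, color.rows);
    // Offsets 20, 0, -20 are just example values
    mergeToColor(ch[0].data, ch[1].data, ch[2].data, color.cols, color.rows, 20, 0, -20, out.data());
    merge_relese();
    // Wrap the interleaved BGR output; note the Mat does not own the buffer
    cv::Mat merged(color.rows, color.cols, CV_8UC3, out.data());
    cv::imwrite("merged.png", merged);                     // placeholder path
}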
1. CUDA kernel
__global__ void merge_image(uchar* src1, uchar* src2, uchar* src3, uchar* drc, int rows, int clos, int value_b, int value_g, int value_r)
{
    // blockDim.x = threads per block, blockIdx.x = block index within the grid,
    // threadIdx.x = thread index within the block (all 1-D here).
    // Each thread handles one pixel of the single-channel inputs.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < clos * rows)
    {
        // Blue channel: saturating add of value_b, clamped to [0, 255]
        if (value_b > 0 && (255 - src1[id]) < value_b)
            drc[id * 3] = 255;
        else if (value_b < 0 && src1[id] <= (-1 * value_b))
            drc[id * 3] = 0;
        else
            drc[id * 3] = src1[id] + value_b;
        // Green channel
        if (value_g > 0 && (255 - src2[id]) < value_g)
            drc[id * 3 + 1] = 255;
        else if (value_g < 0 && src2[id] <= (-1 * value_g))
            drc[id * 3 + 1] = 0;
        else
            drc[id * 3 + 1] = src2[id] + value_g;
        // Red channel
        if (value_r > 0 && (255 - src3[id]) < value_r)
            drc[id * 3 + 2] = 255;
        else if (value_r < 0 && src3[id] <= (-1 * value_r))
            drc[id * 3 + 2] = 0;
        else
            drc[id * 3 + 2] = src3[id] + value_r;
    }
}
uchar* merge_cuda_process(uchar* src1, uchar* src2, uchar* src3, uchar* drc, int rows, int clos, int value_b, int value_g, int value_r) {
    // block = threads per block; grid = number of blocks; total threads = grid * block.
    // Use a fixed block size of 512 instead of "rows": a block can hold at most 1024 threads
    // (this is the fix mentioned in the results section below).
    int block = 512;
    int grid = (clos * rows + block - 1) / block;
    merge_image << <grid, block >> > (src1, src2, src3, drc, rows, clos, value_b, value_g, value_r);
    return drc;
}
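For reference, what the kernel computes is a per-channel saturating offset followed by a channel merge. A CPU equivalent written with OpenCV (a sketch for verifying the GPU output, not part of the accelerated path; the function name merge_cpu_reference is my own) would look like this:

#include <opencv2/opencv.hpp>
#include <vector>

// CPU reference: merge the three single-channel images and apply a saturating per-channel offset.
// b, g, r and the offsets are assumed to match the kernel inputs.
cv::Mat merge_cpu_reference(const cv::Mat& b, const cv::Mat& g, const cv::Mat& r,
                            int value_b, int value_g, int value_r)
{
    std::vector<cv::Mat> channels = { b, g, r };
    cv::Mat bgr;
    cv::merge(channels, bgr);                                   // interleave to CV_8UC3
    cv::add(bgr, cv::Scalar(value_b, value_g, value_r), bgr);   // saturate_cast clamps to [0, 255]
    return bgr;
}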
2. C++ Infer class
int Infer::merge_init(int width, int height)
{
    // Allocate device memory for the B, G and R input channels
    cudaMalloc((void**)&inputdatab, sizeof(uchar) * (width * height));
    cudaMalloc((void**)&inputdatag, sizeof(uchar) * (width * height));
    cudaMalloc((void**)&inputdatar, sizeof(uchar) * (width * height));
    // Allocate device memory for the interleaved BGR output
    cudaMalloc((void**)&outputdata, sizeof(uchar) * (3 * width * height));
    // Allocate host memory for the output image
    output_image_data = new uchar[3 * height * width];
    return 1;
}
uchar* Infer::merge_run(uchar* data_b, uchar* data_g, uchar* data_r, int width, int height, int value_b, int value_g, int value_r)
{
    double start_proess12 = omp_get_wtime();   // requires <omp.h>
    // Copy the three input channels from host to device
    cudaMemcpy(inputdatab, data_b, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    cudaMemcpy(inputdatag, data_g, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    cudaMemcpy(inputdatar, data_r, sizeof(uchar) * (width * height), cudaMemcpyHostToDevice);
    // Launch the CUDA kernel
    merge_cuda_process(inputdatab, inputdatag, inputdatar, outputdata, height, width, value_b, value_g, value_r);
    // Copy the merged BGR image from device back to host
    cudaMemcpy(output_image_data, outputdata, (3 * height * width) * sizeof(uchar), cudaMemcpyDeviceToHost);
    double end_proess12 = omp_get_wtime();
    std::cout << "*********************\n";
    std::cout << "Merge time: " << end_proess12 - start_proess12 << " s\n";
    std::cout << "*********************\n";
    return output_image_data;
}
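The function above does not check the CUDA API return codes; if a copy fails or the kernel launch is invalid (for example, too many threads per block), the error passes silently. A minimal sketch of how the calls could be checked (the CHECK_CUDA macro name is my own; the CUDA calls themselves are standard runtime API):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative macro name; wraps any call that returns cudaError_t.
#define CHECK_CUDA(call)                                                        \
    do {                                                                        \
        cudaError_t err = (call);                                               \
        if (err != cudaSuccess)                                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                         \
                    cudaGetErrorString(err), __FILE__, __LINE__);               \
    } while (0)

// Example usage inside merge_run:
// CHECK_CUDA(cudaMemcpy(inputdatab, data_b, sizeof(uchar) * width * height, cudaMemcpyHostToDevice));
// merge_cuda_process(...);
// CHECK_CUDA(cudaGetLastError());        // reports kernel launch errors
// CHECK_CUDA(cudaDeviceSynchronize());   // reports errors raised while the kernel runs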
int Infer::merge_release()
{
    cudaFree(inputdatab);
    cudaFree(inputdatag);
    cudaFree(inputdatar);
    cudaFree(outputdata);
    delete[] output_image_data;   // allocated with new[], so use delete[]
    return 1;
}
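For completeness, the member pointers used by the three functions above are assumed to be declared in the class roughly like this (a sketch; only the members that actually appear in the snippets are listed):

#include <opencv2/core.hpp>   // provides the uchar typedef

class Infer
{
public:
    int merge_init(int width, int height);
    uchar* merge_run(uchar* data_b, uchar* data_g, uchar* data_r, int width, int height,
                     int value_b, int value_g, int value_r);
    int merge_release();
private:
    uchar* inputdatab = nullptr;        // device buffer, B channel
    uchar* inputdatag = nullptr;        // device buffer, G channel
    uchar* inputdatar = nullptr;        // device buffer, R channel
    uchar* outputdata = nullptr;        // device buffer, interleaved BGR output
    uchar* output_image_data = nullptr; // host buffer returned by merge_run
};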
3. C++ main / DLL exports
Infer infer;
extern "C" __declspec(dllexport) void merge_init(unsigned int width, unsigned int height)
{
    infer.merge_init(width, height);
}
// The offsets can be negative, so they are declared as plain int
extern "C" __declspec(dllexport) void mergeToColor(unsigned char* data_b, unsigned char* data_g, unsigned char* data_r, unsigned int width, unsigned int height,
    int value_b, int value_g, int value_r, uchar* data_)
{
    try
    {
        uchar* output_image_data = infer.merge_run(data_b, data_g, data_r, width, height, value_b, value_g, value_r);
        std::copy(output_image_data, output_image_data + 3 * width * height, data_);
    }
    catch (const std::exception& ex)
    {
        std::cout << ex.what() << "\n" << std::endl;
        // On failure, fill the caller's buffer with zeros instead of reassigning the local pointer
        std::fill(data_, data_ + 3 * width * height, 0);
    }
}
extern "C" __declspec(dllexport) void merge_relese()
{
    infer.merge_release();
}
int main()
{
    std::string path = R"(D:\C++Test\CudaRuntime1\2560_1024_1.bmp)";
    cv::Mat image1 = cv::imread(path, cv::IMREAD_GRAYSCALE);
    uchar* data2 = new uchar[3 * image1.cols * image1.rows];
    merge_init(image1.cols, image1.rows);
    // Run the merge repeatedly to measure a steady-state time
    for (int i = 0; i < 10; i++)
        mergeToColor(image1.data, image1.data, image1.data, image1.cols, image1.rows, 20, 255, -255, data2);
    merge_relese();
    cv::Mat img12 = cv::Mat(cv::Size(image1.cols, image1.rows), CV_8UC3, data2);
    cv::imwrite("D:\\1112.png", img12);
    cv::namedWindow("image", cv::WINDOW_NORMAL);
    cv::imshow("image", img12);
    cv::waitKey();
    cv::destroyAllWindows();
    delete[] data2;
    return 0;
}
II. Performance results
The test machine initially ran out of memory (now resolved: the maximum number of threads per block had not been limited, so the block size in the CUDA launch was changed to a fixed 512; with that fix, merging a 4096*13440 image takes about 40 ms), so smaller images were used for testing. A 2560*1024 image takes about 3 ms, which extrapolates to roughly 18 ms for 4096*3840, much faster than the previous 60 ms.
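The times above were measured with omp_get_wtime() around the whole copy + kernel + copy sequence. If a more precise GPU-side measurement of just the kernel is wanted, CUDA events can be used; a sketch, assuming the device buffers from merge_init have already been filled:

#include <cuda_runtime.h>
#include <iostream>

// Time only the kernel launch with CUDA events (elapsed time is reported in milliseconds)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
merge_cuda_process(inputdatab, inputdatag, inputdatar, outputdata, height, width, value_b, value_g, value_r);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "kernel time: " << ms << " ms\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);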
III. C# conversion methods BitmapToGrayByte and ByteToGrayBitmap
/// <summary>
/// Convert a Bitmap to a byte array
/// </summary>
public static byte[] BitmapToGrayByte(Bitmap bitmap)
{
    BitmapData bitmapData = bitmap.LockBits(new Rectangle(0, 0, bitmap.Width, bitmap.Height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
    IntPtr intPtr = bitmapData.Scan0;
    // Note: this assumes Stride == 3 * Width, i.e. no row padding
    byte[] image = new byte[3 * bitmap.Width * bitmap.Height];
    Marshal.Copy(intPtr, image, 0, 3 * bitmap.Width * bitmap.Height);
    bitmap.UnlockBits(bitmapData);
    return image;
}
/// <summary>
/// Convert a byte array to an 8-bit grayscale Bitmap
/// </summary>
public static Bitmap ByteToGrayBitmap(byte[] rawValues, int width, int height)
{
    // Create the target bitmap and lock its pixel memory
    Bitmap bmp = new Bitmap(width, height, PixelFormat.Format8bppIndexed);
    // Use a grayscale palette so the indexed image displays correctly
    ColorPalette palette = bmp.Palette;
    for (int i = 0; i < 256; i++)
        palette.Entries[i] = Color.FromArgb(i, i, i);
    bmp.Palette = palette;
    BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height),
        ImageLockMode.WriteOnly, PixelFormat.Format8bppIndexed);
    // Image parameters
    int stride = bmpData.Stride;      // width of one scan line, including padding
    int offset = stride - width;      // gap between the visible width and the scan line width
    IntPtr iptr = bmpData.Scan0;      // start of the locked memory
    int scanBytes = stride * height;  // total size of the locked memory
    // Copy the tightly packed source array into a stride-aligned array
    int posScan = 0, posReal = 0;     // positions in the destination and source arrays
    byte[] pixelValues = new byte[scanBytes];
    for (int x = 0; x < height; x++)
    {
        // Copy one scan line
        for (int y = 0; y < width; y++)
        {
            pixelValues[posScan++] = rawValues[posReal++];
        }
        posScan += offset;            // skip the row padding in the destination
    }
    // Copy the stride-aligned array into the BitmapData memory
    Marshal.Copy(pixelValues, 0, iptr, scanBytes);
    // Unlock the bitmap memory
    bmp.UnlockBits(bmpData);
    return bmp;
}
For the image-fusion debugging and algorithm platform development, see: 图像融合(一)__融合效果调试算法平台开发-优快云博客
For handling threshold overflow in image fusion, see: 图像融合(二)__解决阈值溢出问题-优快云博客