使用CUDA STREAM处理一张图片

最新推荐文章于 2024-12-06 17:18:23 发布

YaoJiawei329

最新推荐文章于 2024-12-06 17:18:23 发布

阅读量927

点赞数

文章标签： opencv cuda

本文链接：https://blog.youkuaiyun.com/YaoJiawei329/article/details/113934662

版权

该代码示例展示了如何使用CUDA流进行并行处理，将RGB图像转换为HSV图像。通过创建两个CUDA流，分别处理图片的上半部分和下半部分，实现了任务的异步执行。虽然时间提升不明显，但表明了CUDA流在加速计算上的潜力。代码运行后，1280x720像素的图片转换时间在10ms以内，比未使用CUDA流时快。转换后的图片在imshow中显示为灰色可能是由于16位图像显示问题，但保存后打开显示正常。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

本文写了一个demo，利用cuda流概念，将处理图片的任务分为两个任务：第一个流(stream[0])处理上半张图片，第二个流(stream[1])处理下半张图片。下面先看代码：

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <cuda.h>
#include <cuda_runtime.h>

using namespace cv;
using namespace std;

/**
 * Main kernel: converts packed 8-bit colour pixels to 16-bit HSV.
 *
 * Input pixels are read with .x/.y/.z interpreted as B/G/R. Output pixels
 * are written with .x = H, .y = S, .z = V where
 *   V = max(R, G, B)                                   (0..255)
 *   S = 255 * (V - min(R, G, B)) / V, or 0 when V == min (0..255)
 *   H = 0                            if max == min
 *       60 * (G - B) / delta         if max == R
 *       120 + 60 * (B - R) / delta   if max == G
 *       240 + 60 * (R - G) / delta   if max == B
 *       (+360 when negative, so H lies in [0, 360))
 * All arithmetic is integer arithmetic; divisions truncate toward zero.
 *
 * NOTE: only the first height/2 rows of the buffers are converted. The
 * caller hands each launch one half of the image but passes the height of
 * the whole image, so both buffers must hold at least (height/2)*width
 * pixels.
 *
 * Launch layout: 2-D grid of 2-D blocks, x over columns and y over rows.
 * Both loops advance by gridDim * blockDim, so any grid/block size covers
 * the whole region.
 *
 * @param d_image_RGB  device pointer to the input pixels (row-major, no padding)
 * @param d_image_HSV  device pointer to the output pixels (same layout)
 * @param height       height of the full image; height/2 rows are processed
 * @param width        number of pixels per row
 */
__global__ void global_RGB_to_HSV(const uchar3* d_image_RGB, short3* d_image_HSV ,int height, int width){
    const int rowsToConvert = height / 2;
    const int rowStep = gridDim.y * blockDim.y;
    const int colStep = gridDim.x * blockDim.x;

    for (int row = blockDim.y * blockIdx.y + threadIdx.y; row < rowsToConvert; row += rowStep) {
        for (int col = blockDim.x * blockIdx.x + threadIdx.x; col < width; col += colStep) {
            // Flat index computed in size_t so very large images cannot overflow int.
            const size_t idx = (size_t)row * (size_t)width + (size_t)col;

            // Load the pixel once instead of touching global memory per channel.
            const uchar3 pixel = d_image_RGB[idx];
            const int B = pixel.x;
            const int G = pixel.y;
            const int R = pixel.z;

            // V = max{R, G, B}, m = min{R, G, B}
            int V = (R > G) ? R : G;
            if (B > V) {
                V = B;
            }
            int m = (R < G) ? R : G;
            if (B < m) {
                m = B;
            }
            const int delta = V - m;

            // Saturation scaled to 0..255. The scale factor has to be applied
            // BEFORE the division: (V - m) / V on ints is 0 for every pixel
            // with m > 0, which would throw the saturation away.
            const int S = (delta > 0) ? (255 * delta) / V : 0;

            // Hue in degrees; delta > 0 in every branch that divides by it.
            int H;
            if (delta == 0) {
                H = 0;
            } else if (V == R) {
                H = 60 * (G - B) / delta;
            } else if (V == G) {
                H = 120 + 60 * (B - R) / delta;
            } else {
                H = 240 + 60 * (R - G) / delta;
            }
            // Wrap negative hues into [0, 360).
            if (H < 0) {
                H += 360;
            }

            d_image_HSV[idx] = make_short3((short)H, (short)S, (short)V);
        }
    }
}

// Abort with file/line and the CUDA error string when a runtime call fails.
// Ignored return codes make later failures very hard to trace.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/**
 * Loads an image, converts it to HSV on the GPU using two CUDA streams
 * (stream[0] handles the top rows, stream[1] the bottom rows), prints the
 * elapsed GPU time, then saves and displays the result.
 *
 * Returns 0 on success, 1 when the input image cannot be loaded. Any CUDA
 * failure terminates the process through CUDA_CHECK.
 */
int main()
{
    cv::Mat image_RGB = cv::imread("../../learning_cudastream/front.jpg");
    if (image_RGB.empty()) {
        fprintf(stderr, "Failed to load input image\n");
        return 1;
    }
    // The raw pointer arithmetic below assumes rows are stored back to back.
    if (!image_RGB.isContinuous()) {
        image_RGB = image_RGB.clone();
    }

    int height = image_RGB.rows;
    int width = image_RGB.cols;
    cv::Mat image_HSV(height, width, CV_16SC3);

    // Split on a row boundary so that an odd height loses no pixels:
    // half 0 gets height/2 rows, half 1 gets the remaining rows.
    int rows[2];
    rows[0] = height / 2;
    rows[1] = height - rows[0];

    size_t firstPixel[2];   // offset of each half inside the host images
    size_t pixelCount[2];   // number of pixels in each half
    firstPixel[0] = 0;
    firstPixel[1] = (size_t)rows[0] * (size_t)width;
    for (int i = 0; i < 2; i = i + 1) {
        pixelCount[i] = (size_t)rows[i] * (size_t)width;
    }

    const uchar3* h_image_RGB = reinterpret_cast<const uchar3*>(image_RGB.data);
    short3* h_image_HSV = reinterpret_cast<short3*>(image_HSV.data);

    uchar3* d_image_RGB[2] = {0, 0};
    short3* d_image_HSV[2] = {0, 0};

    // Create the two CUDA streams.
    cudaStream_t stream[2];
    for (int i = 0; i < 2; i = i + 1) {
        CUDA_CHECK(cudaStreamCreate(&stream[i]));
    }

    cudaEvent_t e_start, e_stop;
    CUDA_CHECK(cudaEventCreate(&e_start));
    CUDA_CHECK(cudaEventCreate(&e_stop));
    CUDA_CHECK(cudaEventRecord(e_start, 0));

    // An empty half (height == 1) is skipped everywhere; its pointers stay
    // null, which cudaFree accepts.
    for (int i = 0; i < 2; i = i + 1) {
        if (pixelCount[i] == 0) {
            continue;
        }
        CUDA_CHECK(cudaMalloc(&d_image_RGB[i], pixelCount[i] * sizeof(uchar3)));
        CUDA_CHECK(cudaMalloc(&d_image_HSV[i], pixelCount[i] * sizeof(short3)));
    }

    // Work is issued breadth-first (all uploads, then all kernels, then all
    // downloads) so the two streams get a chance to overlap.
    for (int i = 0; i < 2; i = i + 1) {
        if (pixelCount[i] == 0) {
            continue;
        }
        CUDA_CHECK(cudaMemcpyAsync(d_image_RGB[i], h_image_RGB + firstPixel[i],
                                   pixelCount[i] * sizeof(uchar3),
                                   cudaMemcpyHostToDevice, stream[i]));
    }

    dim3 blocksPerGrid(10, 10, 1);
    dim3 threadsPerBlock(32, 32, 1);

    for (int i = 0; i < 2; i = i + 1) {
        if (pixelCount[i] == 0) {
            continue;
        }
        // The kernel converts (height argument)/2 rows of the buffer it is
        // given, so pass twice the row count of this half.
        global_RGB_to_HSV <<<blocksPerGrid, threadsPerBlock, 0, stream[i]>>> (d_image_RGB[i], d_image_HSV[i], 2 * rows[i], width);
        CUDA_CHECK(cudaGetLastError());   // catches invalid launch configurations
    }

    for (int i = 0; i < 2; i = i + 1) {
        if (pixelCount[i] == 0) {
            continue;
        }
        CUDA_CHECK(cudaMemcpyAsync(h_image_HSV + firstPixel[i], d_image_HSV[i],
                                   pixelCount[i] * sizeof(short3),
                                   cudaMemcpyDeviceToHost, stream[i]));
    }

    // Errors raised while the kernels execute surface here.
    CUDA_CHECK(cudaStreamSynchronize(stream[0]));
    CUDA_CHECK(cudaStreamSynchronize(stream[1]));

    CUDA_CHECK(cudaEventRecord(e_stop, 0));
    CUDA_CHECK(cudaEventSynchronize(e_stop));
    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, e_start, e_stop));
    printf("Time is %3.2f ms\n", elapsedTime);

    // Release every GPU resource created above.
    for (int i = 0; i < 2; i = i + 1) {
        CUDA_CHECK(cudaFree(d_image_RGB[i]));
        CUDA_CHECK(cudaFree(d_image_HSV[i]));
    }
    CUDA_CHECK(cudaEventDestroy(e_start));
    CUDA_CHECK(cudaEventDestroy(e_stop));
    for (int i = 0; i < 2; i = i + 1) {
        CUDA_CHECK(cudaStreamDestroy(stream[i]));
    }

    // NOTE(review): image_HSV is CV_16SC3 while JPEG stores 8-bit data, so
    // OpenCV presumably converts on write - confirm the saved values are
    // what is wanted.
    cv::imwrite("../../learning_cudastream/front_HSV.jpg", image_HSV);
    cv::imshow("HSV", image_HSV);
    cv::waitKey(0);

    return 0;
}

简单说一下代码：代码实现的是将RGB图片转换为HSV图片

__global__ void global_RGB_to_HSV();//具体实现的内核函数
因为HSV中的H取值范围是0-360，超过255，所以用16位数来容纳。
cudaMemcpyAsync((short3*)((short3*)image_HSV.data+height*width/2), d_image_HSV_1, height*width*sizeof(short3)/2, cudaMemcpyDeviceToHost, stream[1]);//在传递的时候要注意指针需要强制类型转换成对应的类型

RGB原图：
变为HSV图：
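但实际在代码运行过后，通过imshow显示的是一张完全灰色的图片。原因应该是imshow不能直接显示16位有符号（CV_16S）图像，而是先把它按比例换算成8位再显示，0-360范围内的数值换算后几乎都落在中间灰度附近，所以整张图看起来是灰色的；而imwrite保存为JPEG时会先把数据转换成8位（超过255的H值应该会被截断为255），所以保存之后再打开就是上面这样正常的结果。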
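处理这样一张1280x720的图片，用时基本在10ms以内。如果不用cuda stream，每次用时都在10ms以上，平均约为11ms。使用cuda stream确实比不使用要快，但此处提升效果不是很明显，可能的原因有两个：一是计时区间里包含了cudaMalloc以及主机与设备之间的来回拷贝，它们可能占了大部分时间；二是image_RGB.data和image_HSV.data是普通的可分页内存（不是锁页内存），在这种内存上cudaMemcpyAsync通常无法真正与核函数重叠执行。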