使用CUDA实现常用的图像卷积操作（锐化、均值滤波、拉普拉斯、Sobel边缘检测、高斯模糊），利用GPU并行计算加快运算速度。

maobin_1
于 2024-12-03 10:12:51 发布
阅读量212
点赞数 3
CC 4.0 BY-SA版权
文章标签： c++ 算法 opencv
本文链接：https://blog.youkuaiyun.com/maobin_1/article/details/144205897
#include <iostream>  
#include <opencv2/opencv.hpp>  
#include <cuda_runtime.h>  

// CUDA kernel: filters a single-channel, row-major float image with a small
// 2-D kernel, writing one output pixel per thread.
//
//   output(r, c) = sum over (k, j) of
//                  input(r + k - kernelHeight / 2, c + j - kernelWidth / 2) * kernel(k, j)
//
// The kernel is applied as written (it is not flipped). Taps that fall outside
// the image are skipped, which is equivalent to zero padding, so the output
// has the same dimensions as the input.
//
// Launch layout: a 2-D grid of 2-D blocks in which x indexes columns and y
// indexes rows; the grid must cover at least inputWidth x inputHeight threads.
// Threads outside the image return without writing. No shared memory is used.
//
// Parameters:
//   input        - inputWidth * inputHeight floats, read only.
//   kernel       - kernelWidth * kernelHeight floats, read only.
//   output       - inputWidth * inputHeight floats, written. Must not overlap
//                  input or kernel (the pointers are declared __restrict__).
//   inputWidth   - image width in pixels.
//   inputHeight  - image height in pixels.
//   kernelWidth  - kernel width in taps.
//   kernelHeight - kernel height in taps.
__global__ void convolutionKernel(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, int inputWidth, int inputHeight, int kernelWidth, int kernelHeight) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so some threads lie past the image edge.
    if (row >= inputHeight || col >= inputWidth) {
        return;
    }

    const int halfHeight = kernelHeight / 2;
    const int halfWidth = kernelWidth / 2;

    float value = 0.0f;

    for (int k = 0; k < kernelHeight; ++k) {
        const int inputRow = row + k - halfHeight;

        // The row test does not depend on j, so an out-of-range row skips the whole tap row.
        if (inputRow < 0 || inputRow >= inputHeight) {
            continue;
        }

        // 64-bit flat offsets: row * width can exceed INT_MAX for very large images.
        const size_t inputRowOffset = static_cast<size_t>(inputRow) * static_cast<size_t>(inputWidth);
        const float *kernelRow = kernel + k * kernelWidth;

        for (int j = 0; j < kernelWidth; ++j) {
            const int inputCol = col + j - halfWidth;

            // Skip taps left or right of the image (zero padding).
            if (inputCol >= 0 && inputCol < inputWidth) {
                value += input[inputRowOffset + static_cast<size_t>(inputCol)] * kernelRow[j];
            }
        }
    }

    // Store the filtered pixel.
    output[static_cast<size_t>(row) * static_cast<size_t>(inputWidth) + static_cast<size_t>(col)] = value;
}

// Logs a failed CUDA runtime call to stderr; returns true only when status is cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host-side wrapper that runs convolutionKernel on the GPU.
//
// Allocates device buffers, uploads the image and the kernel, launches
// convolutionKernel with 16x16 thread blocks on the default stream, downloads
// the result into h_output and releases the device buffers.
//
// Parameters:
//   h_input      - host buffer of inputWidth * inputHeight floats (read).
//   h_kernel     - host buffer of kernelWidth * kernelHeight floats (read).
//   h_output     - host buffer of inputWidth * inputHeight floats (written).
//   inputWidth   - image width in pixels, must be > 0.
//   inputHeight  - image height in pixels, must be > 0.
//   kernelWidth  - kernel width in taps, must be > 0.
//   kernelHeight - kernel height in taps, must be > 0.
//
// Error handling: the function returns void, so failures are reported on
// stderr. On invalid arguments or any CUDA error the remaining steps are
// skipped, device memory is still released, and h_output is left unmodified.
void convolution(float *h_input, float *h_kernel, float *h_output, int inputWidth, int inputHeight, int kernelWidth, int kernelHeight) {
    if (h_input == nullptr || h_kernel == nullptr || h_output == nullptr ||
        inputWidth <= 0 || inputHeight <= 0 || kernelWidth <= 0 || kernelHeight <= 0) {
        std::cerr << "convolution: invalid arguments (null buffer or non-positive dimension)" << std::endl;
        return;
    }

    // Widen before multiplying so the byte counts cannot overflow int.
    const size_t inputSize = static_cast<size_t>(inputWidth) * static_cast<size_t>(inputHeight) * sizeof(float);
    const size_t kernelSize = static_cast<size_t>(kernelWidth) * static_cast<size_t>(kernelHeight) * sizeof(float);
    const size_t outputSize = inputSize;

    // Start as null so the cudaFree calls below are harmless if an allocation fails.
    float *d_input = nullptr;
    float *d_kernel = nullptr;
    float *d_output = nullptr;

    // Allocate device memory and upload the inputs; && stops at the first failure.
    const bool uploaded =
        cudaCallSucceeded(cudaMalloc((void**)&d_input, inputSize), "cudaMalloc(d_input)") &&
        cudaCallSucceeded(cudaMalloc((void**)&d_kernel, kernelSize), "cudaMalloc(d_kernel)") &&
        cudaCallSucceeded(cudaMalloc((void**)&d_output, outputSize), "cudaMalloc(d_output)") &&
        cudaCallSucceeded(cudaMemcpy(d_input, h_input, inputSize, cudaMemcpyHostToDevice), "cudaMemcpy(d_input)") &&
        cudaCallSucceeded(cudaMemcpy(d_kernel, h_kernel, kernelSize, cudaMemcpyHostToDevice), "cudaMemcpy(d_kernel)");

    if (uploaded) {
        // One thread per pixel; the grid is rounded up to whole 16x16 blocks.
        const dim3 threadsPerBlock(16, 16);
        const dim3 numBlocks((inputWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (inputHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        convolutionKernel<<<numBlocks, threadsPerBlock>>>(d_input, d_kernel, d_output, inputWidth, inputHeight, kernelWidth, kernelHeight);

        // A launch returns no status itself; configuration errors surface via cudaGetLastError().
        if (cudaCallSucceeded(cudaGetLastError(), "convolutionKernel launch")) {
            // The blocking copy waits for the kernel, so errors raised during
            // kernel execution are reported by this call.
            cudaCallSucceeded(cudaMemcpy(h_output, d_output, outputSize, cudaMemcpyDeviceToHost), "cudaMemcpy(h_output)");
        }
    }

    // Release device memory on every path.
    cudaFree(d_input);
    cudaFree(d_kernel);
    cudaFree(d_output);
}

// Demo entry point: loads a grayscale image, filters it on the GPU with a 3x3
// Gaussian blur kernel via convolution(), and shows the original and filtered
// images until a key is pressed.
//
// Usage: program [image_path]
//   image_path - optional; when omitted the built-in default path is used.
//
// Returns 0 on success and -1 when the image cannot be loaded.
int main(int argc, char **argv) {
    // Image to load: the first command-line argument if given, else the default path.
    std::string imagePath = "C:\\Users\\mao\\Desktop\\cuda\\cuda3\\1479710347_3342.png";
    if (argc > 1) {
        imagePath = argv[1];
    }

    // Read the image as single-channel grayscale.
    cv::Mat image = cv::imread(imagePath, cv::IMREAD_GRAYSCALE);

    if (image.empty()) {
        std::cerr << "Error: Could not open or find the image!" << std::endl;
        return -1;
    }

    // Dimensions of the input image.
    const int inputWidth = image.cols;
    const int inputHeight = image.rows;

    // Float copy of the pixels (same values as a per-pixel uchar -> float cast).
    // convertTo allocates a fresh matrix, so the rows are stored back to back
    // and the data pointer can be handed to convolution() as a flat array.
    cv::Mat inputImageF;
    image.convertTo(inputImageF, CV_32F);

    // Convolution kernel (row-major, kernelHeight x kernelWidth).
    const int kernelWidth = 3;
    const int kernelHeight = 3;

    // Alternative 3x3 kernels that can be substituted for h_kernel below:
    //
    //   Sharpen:
    //      0, -1,  0,
    //     -1,  5, -1,
    //      0, -1,  0
    //
    //   Mean (box) filter: all nine taps equal to 1/9
    //
    //   Laplacian:
    //      0, -1,  0,
    //     -1,  4, -1,
    //      0, -1,  0
    //
    //   Sobel (labelled "horizontal" in the original notes):
    //     -1,  0,  1,
    //     -2,  0,  2,
    //     -1,  0,  1
    //
    //   Sobel (labelled "vertical" in the original notes):
    //      1,  2,  1,
    //      0,  0,  0,
    //     -1, -2, -1

    float h_kernel[3 * 3] = {
        1.0f/16.0f, 2.0f/16.0f, 1.0f/16.0f,
        2.0f/16.0f, 4.0f/16.0f, 2.0f/16.0f,
        1.0f/16.0f, 2.0f/16.0f, 1.0f/16.0f
    };  // Gaussian blur kernel

    // Output buffer owned by a cv::Mat and zero-filled, so the displayed image
    // is well defined even if the GPU path writes nothing.
    cv::Mat outputImage(inputHeight, inputWidth, CV_32FC1, cv::Scalar(0));

    // Run the convolution on the GPU.
    convolution(inputImageF.ptr<float>(), h_kernel, outputImage.ptr<float>(), inputWidth, inputHeight, kernelWidth, kernelHeight);

    // Convert to 8-bit for display; convertTo saturates values to the 0..255 range.
    cv::Mat outputImage8U;
    outputImage.convertTo(outputImage8U, CV_8U, 1.0);

    // Show the original and the filtered image.
    cv::imshow("Original Image", image);
    cv::imshow("Convoluted Image", outputImage8U);
    cv::waitKey(0);  // wait for a key press

    // Close the display windows; the image buffers are released by cv::Mat.
    cv::destroyAllWindows();

    return 0;
}