双线性插值运算的CUDA实现

最新推荐文章于 2025-02-20 11:02:17 发布

你有多高

最新推荐文章于 2025-02-20 11:02:17 发布

阅读量2.1k

点赞数 3

分类专栏： # 算法实现　　文章标签： cuda 算法

本文链接：https://blog.youkuaiyun.com/MM__1997/article/details/122996968

版权

算法实现专栏收录该内容

4 篇文章

订阅专栏

双线性插值运算原理可以戳这位博主
cuda_utils.h

#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call exactly once and aborts the process if it did
// not return cudaSuccess, after printing the numeric code, the runtime's
// description of it, and the file/line of the call site to std::cerr.
//
// - Wrapped in do { } while (0) so `CUDA_CHECK(x);` is a single statement and
//   can be used safely in an unbraced if/else.
// - Uses std::abort() rather than assert(0): assert compiles to nothing when
//   NDEBUG is defined, which would let a failed call go unnoticed.
#define CUDA_CHECK(callstr)                                                    \
    do {                                                                       \
        cudaError_t cuda_check_err_ = (callstr);                               \
        if (cuda_check_err_ != cudaSuccess) {                                  \
            std::cerr << "CUDA error " << cuda_check_err_ << " ("              \
                      << cudaGetErrorString(cuda_check_err_) << ") at "        \
                      << __FILE__ << ":" << __LINE__ << std::endl;             \
            std::abort();                                                      \
        }                                                                      \
    } while (0)
#endif  // CUDA_CHECK
#endif  // TRTX_CUDA_UTILS_H_

process.cu

#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdio.h>
#include <vector>

using namespace std;

// Cap on the number of threads per block; the launch-dimension helpers in this
// file never request more than this many threads in one block.
#define GPU_BLOCK_THREADS 512
// Kernel prologue. Declares `int position`, the flat global thread index built
// from blockDim.x, blockIdx.x and threadIdx.x, and returns from the kernel when
// `position >= edge`. `edge` is NOT a macro argument: an integer with that
// exact name must be in scope wherever the macro is expanded.
#define KERNEL_POSITION                                     \
    int position = (blockDim.x * blockIdx.x + threadIdx.x); \
    if (position >= (edge))                                 \
        return;

// Returns the 1-D grid size needed to cover `numJobs` work items when each
// block holds min(numJobs, GPU_BLOCK_THREADS) threads.
//
// Uses integer ceil-division: the previous float-based ceil() lost precision
// for job counts above 2^24 and evaluated 0/0 for numJobs == 0. A non-positive
// job count yields a single block so the launch configuration stays valid
// (kernels are expected to bounds-check their index).
static dim3 gridDims(int numJobs) {
    if (numJobs <= 0) {
        return dim3(1);
    }
    const int threadsPerBlock = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
    // Written as quotient + remainder flag to avoid overflow in numJobs + threadsPerBlock - 1.
    const int blocks = numJobs / threadsPerBlock + (numJobs % threadsPerBlock != 0 ? 1 : 0);
    return dim3(blocks);
}

// Returns the 1-D block size for `numJobs` work items: numJobs itself when it
// is smaller than GPU_BLOCK_THREADS, otherwise GPU_BLOCK_THREADS.
//
// A non-positive job count yields one thread; a zero (or wrapped negative)
// block dimension would be rejected as an invalid launch configuration.
static dim3 blockDims(int numJobs) {
    if (numJobs <= 0) {
        return dim3(1);
    }
    return dim3(numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS);
}
// Bilinear resize of one interleaved 3-channel 8-bit image into a float image.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel;
// threads whose flat index is >= `edge` exit immediately (KERNEL_POSITION).
//
// Parameters:
//   dst_gpu     output, dst_width * dst_height pixels, 3 floats per pixel,
//               written in the same channel order as the source
//   src_gpu     input, src_width * src_height pixels, 3 bytes per pixel,
//               rows tightly packed (row stride = src_width * 3)
//   src_width, src_height   source size in pixels (must be >= 1)
//   dst_width, dst_height   destination size in pixels (must be >= 1)
//   edge        number of destination pixels to process
//   if_reverse  unused; kept for the colour-inversion option of the disabled
//               grayscale/normalisation path
//
// Sampling uses pixel-centre alignment: src = (dst + 0.5) * scale - 0.5.
// The sample position is clamped to [0, size - 1] and the "high" neighbour is
// clamped to the last row/column, i.e. border pixels are replicated. This keeps
// the four weights in [0, 1] and summing to 1 everywhere; previously the
// right/bottom border dropped neighbours (darkening the edge) and the left/top
// border extrapolated with a negative weight.
//
// Grayscale conversion, inversion and (x - 0.5) / 0.5 normalisation are not
// performed here; the output is the resized image with values in [0, 255].
__global__ static void preprocess_kernel(
    float *__restrict__ dst_gpu, const unsigned char *__restrict__ src_gpu,
    int src_width, int src_height,
    int dst_width, int dst_height, int edge, int if_reverse) {

    KERNEL_POSITION;
    (void)if_reverse;

    // Destination pixel coordinates for this thread.
    const int out_x = position % dst_width;
    const int out_y = (position / dst_width) % dst_height;

    // Source-to-destination scale factors.
    const float scale_x = src_width / (float)dst_width;
    const float scale_y = src_height / (float)dst_height;

    // Centre-aligned source coordinates, clamped into the image.
    float in_x = (out_x + 0.5f) * scale_x - 0.5f;
    float in_y = (out_y + 0.5f) * scale_y - 0.5f;
    in_x = fminf(fmaxf(in_x, 0.0f), (float)(src_width - 1));
    in_y = fminf(fmaxf(in_y, 0.0f), (float)(src_height - 1));

    // Integer neighbours; the high neighbour collapses onto the low one at the border.
    const int x0 = (int)floorf(in_x);
    const int y0 = (int)floorf(in_y);
    const int x1 = (x0 + 1 < src_width) ? x0 + 1 : x0;
    const int y1 = (y0 + 1 < src_height) ? y0 + 1 : y0;

    // Fractional offsets and the four bilinear weights.
    const float fx = in_x - (float)x0;
    const float fy = in_y - (float)y0;
    const float w00 = (1.0f - fx) * (1.0f - fy);
    const float w10 = fx * (1.0f - fy);
    const float w01 = (1.0f - fx) * fy;
    const float w11 = fx * fy;

    const size_t row_stride = (size_t)src_width * 3;
    const unsigned char *p00 = src_gpu + (size_t)y0 * row_stride + (size_t)x0 * 3;
    const unsigned char *p10 = src_gpu + (size_t)y0 * row_stride + (size_t)x1 * 3;
    const unsigned char *p01 = src_gpu + (size_t)y1 * row_stride + (size_t)x0 * 3;
    const unsigned char *p11 = src_gpu + (size_t)y1 * row_stride + (size_t)x1 * 3;

    float *out = dst_gpu + (size_t)position * 3;
#pragma unroll
    for (int c = 0; c < 3; ++c) {
        out[c] = w00 * p00[c] + w10 * p10[c] + w01 * p01[c] + w11 * p11[c];
    }
}

// 预处理调用 resize, 多batch异步copy计算
void preprocess_resize_mulbatch(
    int batch,
    float *dst_gpu,
    unsigned char *src_gpu, unsigned char **src_cpu,
    int *src_widths, int *src_heights,
    int dst_width, int dst_height,
    CUstream_st *stream) {

    int dst_area = dst_width * dst_height;
    int dst_batch_plane = dst_area * 3;
    int job_count = dst_area;

    unsigned char *sp = src_gpu;
    for (int b = 0; b < batch; ++b) {
        int src_batch_plane = src_widths[b] * src_heights[b] * 3;
        cudaMemcpyAsync(sp, src_cpu[b], src_batch_plane, cudaMemcpyKind::cudaMemcpyHostToDevice, stream);
        sp += src_batch_plane;
    }

    auto grid = gridDims(job_count);
    auto block = blockDims(job_count);
    for (int b = 0; b < batch; ++b) {
        int src_area = src_widths[b] * src_heights[b];
        int src_batch_plane = src_area * 3;

        preprocess_kernel<<<grid, block, 0, stream>>>(dst_gpu, src_gpu, src_widths[b], src_heights[b], dst_width, dst_height, job_count, 0);

        dst_gpu += dst_batch_plane;
        src_gpu += src_batch_plane;
    }

}

main.cu

#include <bits/stdc++.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>

#include "cuda_utils.h"
#include "process.cu"
using namespace std;
using namespace cv;

const int batch = 128;

int main() {

    Mat img = imread("");
    vector<Mat> img_vec;
    for (int i = 0; i < batch; i++) {
        img_vec.push_back(img);
    }

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    int dst_width = 640;
    int dst_height = 640;
    float *dst_gpu;
    float *dst_cpu;
    unsigned char *src_gpu;
    unsigned char *src_cpu[batch];
    int src_width[batch];
    int src_height[batch];

    int size = 0;
    for (int i = 0; i < batch; i++) {
        size += (3 * img_vec[i].cols * img_vec[i].rows);
    }

    dst_cpu = new float[batch * 3 * dst_width * dst_height];
    CUDA_CHECK(cudaMalloc((void**)&dst_gpu, batch * 3 * dst_width * dst_height * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&src_gpu, size * sizeof(unsigned char)));

    for (int i = 0; i < batch; i++) {
        src_cpu[i] = img_vec[i].data;
        src_width[i] = img_vec[i].cols;
        src_height[i] = img_vec[i].rows;
    }

    preprocess_resize_mulbatch(batch, dst_gpu, src_gpu, src_cpu, src_width, src_height, dst_width, dst_height, stream);
    CUDA_CHECK(cudaMemcpyAsync(dst_cpu, dst_gpu, batch * 3 * dst_width * dst_height * sizeof(float), cudaMemcpyDeviceToHost, stream));

    Mat img_show(dst_height, dst_width, CV_8UC3);
    for (int i = 0; i < 3 * dst_width * dst_height; i++) {
        img_show.data[i] = dst_cpu[i];
    }

    // namedWindow("show", WINDOW_NORMAL);
    imshow("show", img_show);
    waitKey(0);

    return 0;
}