-
双线性插值的运算原理可以参考这位博主的文章。
-
cuda_utils.h
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call exactly once and aborts the process when it
// does not return cudaSuccess.
//
// On failure the numeric error code, the runtime's description of it
// (cudaGetErrorString), and the call site (__FILE__:__LINE__) are written to
// std::cerr before std::abort() is called. abort() is used rather than
// assert(0) so that failures are not silently ignored when NDEBUG is defined.
//
// The body is wrapped in do { ... } while (0) so the macro behaves like a
// single statement; use it with a trailing semicolon: CUDA_CHECK(call);
#define CUDA_CHECK(callstr)                                                   \
    do {                                                                      \
        cudaError_t error_code = (callstr);                                   \
        if (error_code != cudaSuccess) {                                      \
            std::cerr << "CUDA error " << error_code << " ("                  \
                      << cudaGetErrorString(error_code) << ") at "            \
                      << __FILE__ << ":" << __LINE__ << std::endl;            \
            std::abort();                                                     \
        }                                                                     \
    } while (0)
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_
process.cu
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdio.h>
#include <vector>
using namespace std;
// Upper bound on the number of threads per block used by gridDims()/blockDims().
#define GPU_BLOCK_THREADS 512

// Kernel prologue. Declares `position`, the flat 1D global thread index, and
// makes the thread return immediately when that index is not below `edge`.
// The enclosing kernel must have a variable named `edge` in scope.
#define KERNEL_POSITION                                         \
    int position = (blockIdx.x * blockDim.x + threadIdx.x);     \
    if (position >= (edge))                                     \
        return;
// Returns the 1D grid size (number of blocks) needed to cover `numJobs`
// work items when each block runs min(numJobs, GPU_BLOCK_THREADS) threads.
//
// The ceil-division is done in integer arithmetic: a float division cannot
// represent job counts above 2^24 exactly and would under-count blocks, and
// it produces NaN for numJobs == 0. Non-positive job counts yield one block
// so the returned dimension is never zero.
static dim3 gridDims(int numJobs) {
    if (numJobs <= 0) {
        return dim3(1);
    }
    const int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
    // (numJobs - 1) / n + 1 == ceil(numJobs / n) for numJobs >= 1, and it
    // cannot overflow the way (numJobs + n - 1) could near INT_MAX.
    return dim3((numJobs - 1) / numBlockThreads + 1);
}
// Returns the 1D block size (threads per block) for `numJobs` work items:
// numJobs itself when it is smaller than GPU_BLOCK_THREADS, otherwise
// GPU_BLOCK_THREADS. Non-positive job counts are clamped to one thread so
// the returned dimension is never zero (and a negative value is never
// converted to a huge unsigned dimension).
static dim3 blockDims(int numJobs) {
    if (numJobs <= 0) {
        return dim3(1);
    }
    return dim3(numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS);
}
// Preprocessing kernel: bilinear resize of one 3-channel 8-bit image into a
// 3-channel float image. (Grayscale conversion / normalisation are NOT
// performed here; see the note at the end of the kernel.)
//
// Launch layout: 1D grid, one thread per output pixel. `edge` is the number
// of output pixels; threads whose flat index is >= edge exit immediately.
//
// dst_gpu    : output, 3 floats per pixel, interleaved, row-major
//              (dst_width * dst_height * 3 floats).
// src_gpu    : input, 3 bytes per pixel, interleaved, row-major with a row
//              stride of src_width * 3 bytes (no padding).
// src_width / src_height : input size in pixels.
// dst_width / dst_height : output size in pixels.
// edge       : number of output pixels to process.
// if_reverse : currently unused; it belonged to the optional grayscale /
//              colour-inversion step described below.
//
// The three channels are treated identically; the original code names them
// b, g, r, i.e. it presumably expects OpenCV-style BGR input.
__global__ static void preprocess_kernel(
    float *dst_gpu, unsigned char *src_gpu,
    int src_width, int src_height,
    int dst_width, int dst_height, int edge, int if_reverse) {
    KERNEL_POSITION;
    (void)if_reverse;

    // Nothing can be sampled from an empty source image.
    if (src_width <= 0 || src_height <= 0) {
        return;
    }

    // Output pixel handled by this thread.
    const int cur_ox = position % dst_width;
    const int cur_oy = (position / dst_width) % dst_height;

    // Source/destination scale with pixel-centre alignment.
    // (The non-aligned alternative would use
    //  sx = (src_width - 1) / (float)(dst_width - 1) and cur_ix = cur_ox * sx.)
    const float sx = src_width / (float)dst_width;
    const float sy = src_height / (float)dst_height;

    // Continuous source coordinate of this output pixel's centre.
    float cur_ix = (cur_ox + 0.5f) * sx - 0.5f;
    float cur_iy = (cur_oy + 0.5f) * sy - 0.5f;

    // Clamp the sample position into the image. Without this, coordinates
    // below 0 produce weights outside [0, 1] (extrapolation), and coordinates
    // beyond the last pixel lose the weight of the missing neighbour, which
    // darkens the right/bottom border.
    cur_ix = fminf(fmaxf(cur_ix, 0.0f), (float)(src_width - 1));
    cur_iy = fminf(fmaxf(cur_iy, 0.0f), (float)(src_height - 1));

    // The four neighbouring source pixels (high indices replicate the edge).
    const int low_ix = (int)floorf(cur_ix);
    const int low_iy = (int)floorf(cur_iy);
    const int high_ix = (low_ix + 1 < src_width) ? low_ix + 1 : src_width - 1;
    const int high_iy = (low_iy + 1 < src_height) ? low_iy + 1 : src_height - 1;

    // Fractional offsets and the resulting bilinear weights (they sum to 1).
    const float fx = cur_ix - low_ix;
    const float fy = cur_iy - low_iy;
    const float w_ll = (1.0f - fx) * (1.0f - fy);  // (low_ix,  low_iy)
    const float w_hl = fx * (1.0f - fy);           // (high_ix, low_iy)
    const float w_lh = (1.0f - fx) * fy;           // (low_ix,  high_iy)
    const float w_hh = fx * fy;                    // (high_ix, high_iy)

    // 64-bit offsets so large images cannot overflow the index arithmetic.
    const unsigned char *p_ll = src_gpu + ((size_t)low_iy * src_width + low_ix) * 3;
    const unsigned char *p_hl = src_gpu + ((size_t)low_iy * src_width + high_ix) * 3;
    const unsigned char *p_lh = src_gpu + ((size_t)high_iy * src_width + low_ix) * 3;
    const unsigned char *p_hh = src_gpu + ((size_t)high_iy * src_width + high_ix) * 3;

    float *out = dst_gpu + (size_t)position * 3;
    for (int c = 0; c < 3; ++c) {
        out[c] = w_ll * p_ll[c] + w_hl * p_hl[c] + w_lh * p_lh[c] + w_hh * p_hh[c];
    }

    // Optional post-processing that is intentionally disabled:
    //   gray   = r * 0.299f + g * 0.587f + b * 0.114f;      // to grayscale
    //   gray   = floor(gray + 0.5f);                        // round like the Python pipeline
    //   if (if_reverse == 1) gray = 255.0f - gray;          // colour inversion
    //   result = (gray / 255.0f - 0.5f) / 0.5f;             // normalise to [-1, 1]
    // It would write a single float per pixel to dst_gpu[position].
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool preprocess_cuda_ok(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "preprocess_resize_mulbatch: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Batched resize: uploads `batch` host images and enqueues one
// preprocess_kernel launch per image, all on `stream`.
//
// batch       : number of images.
// dst_gpu     : device output; image b is written at offset
//               b * dst_width * dst_height * 3 floats.
// src_gpu     : device staging buffer; the images are packed back to back,
//               so it must hold the sum of width * height * 3 bytes.
// src_cpu     : host pointers to the images (3 bytes per pixel, tightly packed).
// src_widths / src_heights : per-image sizes in pixels.
// dst_width / dst_height   : common output size in pixels.
// stream      : stream on which the copies and kernels are enqueued.
//
// The function does not synchronise: the caller must synchronise `stream`
// before reading dst_gpu, and src_cpu[b] must stay valid until then.
// If the arguments are degenerate or a CUDA call fails, a message is printed
// to stderr and the function returns without enqueuing further work.
void preprocess_resize_mulbatch(
    int batch,
    float *dst_gpu,
    unsigned char *src_gpu, unsigned char **src_cpu,
    int *src_widths, int *src_heights,
    int dst_width, int dst_height,
    CUstream_st *stream) {
    if (batch <= 0 || dst_width <= 0 || dst_height <= 0) {
        return;  // nothing to do; also avoids a zero-sized kernel launch
    }

    // Reject empty source images up front so no work is enqueued for a
    // partially valid batch.
    for (int b = 0; b < batch; ++b) {
        if (src_widths[b] <= 0 || src_heights[b] <= 0 || src_cpu[b] == nullptr) {
            fprintf(stderr, "preprocess_resize_mulbatch: image %d is empty\n", b);
            return;
        }
    }

    const int dst_area = dst_width * dst_height;
    const size_t dst_batch_plane = (size_t)dst_area * 3;
    const int job_count = dst_area;  // one thread per output pixel

    // Stage 1: enqueue all host-to-device copies, packing images contiguously.
    unsigned char *sp = src_gpu;
    for (int b = 0; b < batch; ++b) {
        const size_t src_batch_plane = (size_t)src_widths[b] * src_heights[b] * 3;
        if (!preprocess_cuda_ok(
                cudaMemcpyAsync(sp, src_cpu[b], src_batch_plane,
                                cudaMemcpyKind::cudaMemcpyHostToDevice, stream),
                "cudaMemcpyAsync")) {
            return;
        }
        sp += src_batch_plane;
    }

    // Stage 2: one kernel launch per image; stream ordering guarantees the
    // copies above have completed before the kernels read src_gpu.
    auto grid = gridDims(job_count);
    auto block = blockDims(job_count);
    for (int b = 0; b < batch; ++b) {
        const size_t src_batch_plane = (size_t)src_widths[b] * src_heights[b] * 3;
        preprocess_kernel<<<grid, block, 0, stream>>>(
            dst_gpu, src_gpu, src_widths[b], src_heights[b], dst_width, dst_height, job_count, 0);
        // Kernel launches do not return a status; launch-configuration
        // errors must be fetched explicitly.
        if (!preprocess_cuda_ok(cudaGetLastError(), "preprocess_kernel launch")) {
            return;
        }
        dst_gpu += dst_batch_plane;
        src_gpu += src_batch_plane;
    }
}
main.cu
#include <bits/stdc++.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "process.cu"
using namespace std;
using namespace cv;
const int batch = 128;
// Demo driver: loads one image, replicates it `batch` times, resizes the
// whole batch to 640x640 on the GPU and displays the first result.
//
// Usage: <program> [image_path]
// When no path is given the original empty placeholder path is used, which
// makes imread fail and the program exit with an error message.
int main(int argc, char **argv) {
    const std::string image_path = argc > 1 ? argv[1] : "";
    Mat img = imread(image_path);
    if (img.empty()) {
        cerr << "Failed to read image: '" << image_path << "'" << endl;
        return 1;
    }
    // The GPU code assumes tightly packed rows (row stride = cols * 3).
    if (!img.isContinuous()) {
        img = img.clone();
    }

    // Mat copies are shallow, so every batch entry shares the same pixels.
    vector<Mat> img_vec(batch, img);

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const int dst_width = 640;
    const int dst_height = 640;
    const size_t dst_plane = (size_t)3 * dst_width * dst_height;
    const size_t dst_count = (size_t)batch * dst_plane;

    unsigned char *src_cpu[batch];
    int src_width[batch];
    int src_height[batch];
    size_t src_bytes = 0;  // size_t: an int overflows for large images/batches
    for (int i = 0; i < batch; i++) {
        src_cpu[i] = img_vec[i].data;
        src_width[i] = img_vec[i].cols;
        src_height[i] = img_vec[i].rows;
        src_bytes += (size_t)3 * img_vec[i].cols * img_vec[i].rows;
    }

    vector<float> dst_cpu(dst_count);
    float *dst_gpu = nullptr;
    unsigned char *src_gpu = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dst_gpu, dst_count * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&src_gpu, src_bytes * sizeof(unsigned char)));

    preprocess_resize_mulbatch(batch, dst_gpu, src_gpu, src_cpu, src_width, src_height, dst_width, dst_height, stream);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(dst_cpu.data(), dst_gpu, dst_count * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // The copy is asynchronous: wait for the stream before touching dst_cpu.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Device resources are no longer needed once the result is on the host.
    CUDA_CHECK(cudaFree(src_gpu));
    CUDA_CHECK(cudaFree(dst_gpu));
    CUDA_CHECK(cudaStreamDestroy(stream));

    // Convert the first image of the batch back to 8-bit for display.
    // saturate_cast rounds and clamps to [0, 255] instead of wrapping.
    Mat img_show(dst_height, dst_width, CV_8UC3);
    for (size_t i = 0; i < dst_plane; i++) {
        img_show.data[i] = saturate_cast<uchar>(dst_cpu[i]);
    }
    // namedWindow("show", WINDOW_NORMAL);
    imshow("show", img_show);
    waitKey(0);
    return 0;
}