最近心血来潮,想用CUDA加速光流计算,于是简单研究了一下CUDA编程,自然地想到了C++和CUDA的混合编程。
一般来说,C++代码可通过g++、clang、MSVC等编译器编译,CUDA代码则使用NVIDIA的nvcc编译。
CUDA C++在很大程度上兼容标准C++(nvcc会把主机端代码转交给宿主编译器处理),所以可以直接使用nvcc来编译.cpp文件。
下面举一个简单的例子来说明。
kernel.cu
#include <cuda_runtime.h>
#include <iostream>
// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// Works with any 1D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads, so the grid
// does not have to cover `size` exactly. A non-positive `size` does nothing.
//
// c, a and b are dereferenced on the device, so they must be device-accessible
// pointers to at least `size` ints.
__global__ void addKernel(int* c, const int* a, const int* b, int size) {
    // 64-bit arithmetic keeps blockIdx.x * blockDim.x and the stride from
    // wrapping when the grid is large.
    const long long total = size;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < total; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Adds two host arrays element-wise on the GPU: c[i] = a[i] + b[i] for i in [0, size).
//
// Parameters:
//   c    - host output array with room for `size` ints
//   a, b - host input arrays holding `size` ints each
//   size - number of elements; a non-positive value is a no-op
//
// Every CUDA call is checked. On the first failure a message is written to
// stderr, the remaining steps are skipped (so `c` is left untouched), and the
// device buffers are still released.
void addWithCuda(int* c, const int* a, const int* b, int size) {
    if (size <= 0) {
        return;  // nothing to add; also avoids an invalid zero-thread launch
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);
    int* dev_a = nullptr;
    int* dev_b = nullptr;
    int* dev_c = nullptr;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << "addWithCuda: " << what << " failed: "
                      << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // && short-circuits, so the first failing step stops the chain.
    bool good = ok(cudaMalloc(&dev_a, bytes), "cudaMalloc(dev_a)")
             && ok(cudaMalloc(&dev_b, bytes), "cudaMalloc(dev_b)")
             && ok(cudaMalloc(&dev_c, bytes), "cudaMalloc(dev_c)")
             && ok(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device)")
             && ok(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device)");

    if (good) {
        // A single block cannot hold arbitrarily many threads, so spread the
        // work over ceil(size / threadsPerBlock) blocks. Written as
        // (size - 1) / T + 1 so the numerator cannot overflow int.
        const int threadsPerBlock = 256;
        const int blocks = (size - 1) / threadsPerBlock + 1;
        addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

        // A launch returns no status; configuration errors surface through
        // cudaGetLastError(), and the blocking copy below reports errors
        // raised while the kernel was executing.
        good = ok(cudaGetLastError(), "addKernel launch")
            && ok(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device -> c)");
    }

    // cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
main.cpp
#include <iostream>
// Defined in kernel.cu; declared here so this host-only file can call it.
void addWithCuda(int *c, const int *a, const int *b, int size);

// Adds two fixed five-element arrays through addWithCuda and prints the sums
// on one line, prefixed with "Result: ".
int main()
{
    const int size = 5;
    int a[size] = {1, 2, 3, 4, 5};
    int b[size] = {10, 20, 30, 40, 50};
    int c[size] = {0};  // zero-filled output buffer

    addWithCuda(c, a, b, size);

    std::cout << "Result: ";
    for (const int sum : c)
        std::cout << sum << " ";
    std::cout << std::endl;

    return 0;
}
或者在同一个CUDA文件中编写两种代码:
#include <cuda_runtime.h>
#include <iostream>
// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// Works with any 1D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads, so the grid
// does not have to cover `size` exactly. A non-positive `size` does nothing.
//
// c, a and b are dereferenced on the device, so they must be device-accessible
// pointers to at least `size` ints.
__global__ void addKernel(int* c, const int* a, const int* b, int size) {
    // 64-bit arithmetic keeps blockIdx.x * blockDim.x and the stride from
    // wrapping when the grid is large.
    const long long total = size;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < total; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Adds two host arrays element-wise on the GPU: c[i] = a[i] + b[i] for i in [0, size).
//
// Parameters:
//   c    - host output array with room for `size` ints
//   a, b - host input arrays holding `size` ints each
//   size - number of elements; a non-positive value is a no-op
//
// Every CUDA call is checked. On the first failure a message is written to
// stderr, the remaining steps are skipped (so `c` is left untouched), and the
// device buffers are still released.
void addWithCuda(int* c, const int* a, const int* b, int size) {
    if (size <= 0) {
        return;  // nothing to add; also avoids an invalid zero-thread launch
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);
    int* dev_a = nullptr;
    int* dev_b = nullptr;
    int* dev_c = nullptr;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << "addWithCuda: " << what << " failed: "
                      << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // && short-circuits, so the first failing step stops the chain.
    bool good = ok(cudaMalloc(&dev_a, bytes), "cudaMalloc(dev_a)")
             && ok(cudaMalloc(&dev_b, bytes), "cudaMalloc(dev_b)")
             && ok(cudaMalloc(&dev_c, bytes), "cudaMalloc(dev_c)")
             && ok(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device)")
             && ok(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device)");

    if (good) {
        // A single block cannot hold arbitrarily many threads, so spread the
        // work over ceil(size / threadsPerBlock) blocks. Written as
        // (size - 1) / T + 1 so the numerator cannot overflow int.
        const int threadsPerBlock = 256;
        const int blocks = (size - 1) / threadsPerBlock + 1;
        addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

        // A launch returns no status; configuration errors surface through
        // cudaGetLastError(), and the blocking copy below reports errors
        // raised while the kernel was executing.
        good = ok(cudaGetLastError(), "addKernel launch")
            && ok(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device -> c)");
    }

    // cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
// Entry point of the single-file variant: adds two fixed five-element arrays
// through addWithCuda and prints the sums on one line after "Result: ".
int main()
{
    const int size = 5;
    int a[size] = {1, 2, 3, 4, 5};
    int b[size] = {10, 20, 30, 40, 50};
    int c[size] = {0};  // zero-filled output buffer

    addWithCuda(c, a, b, size);

    std::cout << "Result: ";
    for (const int sum : c)
        std::cout << sum << " ";
    std::cout << std::endl;

    return 0;
}
如果是前者,需要同时编译main.cpp和kernel.cu,那么编译指令是:
nvcc main.cpp kernel.cu -o main.exe -lcudart -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\lib\x64"
后者直接编译kernel.cu:
nvcc kernel.cu -o main.exe -lcudart -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\lib\x64"
最后都会生成可执行文件main.exe(由 -o main.exe 指定)。
另外,nvcc可以支持C++17标准(编译时加上 -std=c++17),但需要CUDA 11.0及以上版本;CUDA 9.0起支持的是C++14。