// CUDA host-side main function template.
//
// Skeleton of the usual host workflow: allocate device memory, copy the input
// to the device, launch the kernel, copy the result back, free device memory.
// NOTE: `size`, `M`, `kernel`, `dimGrid`, `dimBlock` and `arguments` are not
// declared in this snippet; they are placeholders the real program supplies.
// `CopyFromDeviceMatrix` and `FreeDeviceMatrix` are helpers defined elsewhere.
int main(){
    // Allocate memory on the GPU
    float *Md;
    cudaMalloc((void**)&Md, size);
    // Copy data from CPU (host) memory to the GPU (device)
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    // Call the GPU kernel function that performs the program's computation
    kernel<<<dimGrid, dimBlock>>> (arguments);
    // Copy data from the GPU back to CPU memory
    CopyFromDeviceMatrix(M, Md);
    // Free device matrices (release the GPU memory)
    FreeDeviceMatrix(Md);
}
应用实例:基于CUDA的两幅数字图像相加
部分主函数代码如下:
// Raises an OpenCV error (cv::Exception) when a CUDA runtime call did not
// return cudaSuccess. `what` names the step that failed and is included in
// the error message together with the CUDA error string.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        CV_Error_(cv::Error::StsError,
                  ("%s failed: %s", what, cudaGetErrorString(status)));
    }
}

// Adds two grayscale images on the GPU and displays both inputs and the sum.
//
// Steps: load the two images, allocate three device buffers, copy the inputs
// to the device, launch imageAdd, copy the result back, show the images, and
// release the device buffers.
// Returns 0 on success and 1 when an image could not be loaded, the images do
// not match in size, or a CUDA call failed.
int main() {
    // Device buffers start out null so the cleanup at the end is valid on
    // every path (cudaFree accepts a null pointer).
    uchar* img1_device = NULL;
    uchar* img2_device = NULL;
    uchar* imgres_device = NULL;
    int exitCode = 0;

    try {
        // source images, gray
        Mat img1_host = imread("D:\\pictures\\3.jpg", IMREAD_GRAYSCALE);
        Mat img2_host = imread("D:\\pictures\\4.jpg", IMREAD_GRAYSCALE);

        // imread returns an empty Mat when the file cannot be read.
        CV_Assert(!img1_host.empty() && !img2_host.empty());
        // Both copies below use the size of image 1, so image 2 must match;
        // otherwise the second host-to-device copy reads past its buffer.
        CV_Assert(img1_host.size() == img2_host.size());
        // The pixel data is copied as one flat block of rows * cols bytes.
        CV_Assert(img1_host.isContinuous() && img2_host.isContinuous());

        const int row = img1_host.rows;
        const int col = img1_host.cols;
        const int length = row * col;

        // memory size in bytes (one uchar per pixel)
        const size_t memSize = static_cast<size_t>(length) * sizeof(uchar);

        // device memory
        checkCuda(cudaMalloc((void**)&img1_device, memSize), "cudaMalloc(img1_device)");
        checkCuda(cudaMalloc((void**)&img2_device, memSize), "cudaMalloc(img2_device)");
        checkCuda(cudaMalloc((void**)&imgres_device, memSize), "cudaMalloc(imgres_device)");

        // copy host to device
        checkCuda(cudaMemcpy(img1_device, img1_host.data, memSize, cudaMemcpyHostToDevice),
                  "cudaMemcpy(img1 host->device)");
        checkCuda(cudaMemcpy(img2_device, img2_host.data, memSize, cudaMemcpyHostToDevice),
                  "cudaMemcpy(img2 host->device)");

        // setting parameters and run the kernel function
        // Ceil-divide so that grid * block covers every pixel. The previous
        // formula divided by 32*32+1 and could launch fewer threads than
        // pixels (e.g. length = 2049 produced only 2048 threads).
        // NOTE(review): assumes imageAdd bounds-checks its index against
        // `length`, since the last block may be only partially used - confirm
        // against the kernel definition.
        const int threadsPerBlock = 32 * 32;
        dim3 block(32, 32, 1);                                             // block
        dim3 grid((length + threadsPerBlock - 1) / threadsPerBlock, 1, 1); // grid
        imageAdd<<<grid, block>>>(img1_device, img2_device, imgres_device, length);
        // A kernel launch returns no status itself; query it explicitly.
        checkCuda(cudaGetLastError(), "imageAdd launch");

        // copy device to host
        Mat imgres_host = Mat::zeros(row, col, CV_8UC1);
        checkCuda(cudaMemcpy(imgres_host.data, imgres_device, memSize, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(result device->host)");

        // show source and result images
        imshow("img1", img1_host);
        imshow("img2", img2_host);
        imshow("imgres", imgres_host);
        waitKey(0);
    } catch (const cv::Exception&) {
        // NOTE(review): OpenCV's default error handler is expected to have
        // reported the message already - confirm if a custom one is installed.
        exitCode = 1;
    }

    // Release device memory on both the success and the failure path.
    cudaFree(img1_device);
    cudaFree(img2_device);
    cudaFree(imgres_device);
    system("pause");
    return exitCode;
}
上面的主函数遵循了前述CUDA编程模板的各个步骤:分配GPU内存、将数据从CPU拷贝到GPU、调用内核函数、将结果从GPU拷贝回CPU、释放GPU内存。