//CUDA编程主函数模板
// Host-side skeleton of a CUDA program: allocate device memory, upload the
// input, launch the kernel, download the result, release device memory.
//
// This is a template, not a complete program: `size`, `M`, `dimGrid`,
// `dimBlock`, `kernel` and `arguments` are placeholders that must be supplied
// by the surrounding code.
int main() {
    // Allocate memory on the GPU.
    float *Md;
    cudaMalloc((void**)&Md, size);

    // Copy data from CPU (host) memory to the GPU.
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);

    // Call the GPU kernel function to perform the computation.
    kernel<<<dimGrid, dimBlock>>>(arguments);

    // Copy data from the GPU back to CPU memory.
    // NOTE(review): CopyFromDeviceMatrix is not defined in this snippet;
    // presumably it wraps cudaMemcpy(..., cudaMemcpyDeviceToHost) -- confirm.
    CopyFromDeviceMatrix(M, Md);

    // Free device matrices.
    // NOTE(review): FreeDeviceMatrix is not defined in this snippet;
    // presumably it wraps cudaFree -- confirm.
    FreeDeviceMatrix(Md);
}
应用实例:基于CUDA的两幅数字图像相加
部分主函数代码如下:
主函数中包括了上述编程的模板。
#include <cstdio>   // fprintf
#include <cstdlib>  // exit, system

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the step for the error message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        // Device allocations are reclaimed when the process exits.
        exit(EXIT_FAILURE);
    }
}

/**
 * Loads two grayscale images, adds them on the GPU with the `imageAdd`
 * kernel, and shows both inputs and the result in OpenCV windows.
 *
 * Steps: allocate three device buffers, upload both inputs, launch the
 * kernel, download the result, display, free the device buffers.
 *
 * Returns 0 on success and 1 when an input image cannot be loaded or the two
 * images differ in size. Any CUDA failure terminates the process via
 * checkCuda().
 */
int main() {
    // Source images, loaded as single-channel 8-bit grayscale.
    Mat img1_host = imread("D:\\pictures\\3.jpg", IMREAD_GRAYSCALE);
    Mat img2_host = imread("D:\\pictures\\4.jpg", IMREAD_GRAYSCALE);

    // imread yields an empty Mat on failure; copying from it would read a null pointer.
    if (img1_host.empty() || img2_host.empty()) {
        fprintf(stderr, "Failed to load one of the input images\n");
        return 1;
    }
    // Both uploads use the same byte count, so the images must match in size.
    if (img1_host.rows != img2_host.rows || img1_host.cols != img2_host.cols) {
        fprintf(stderr, "Input images must have identical dimensions\n");
        return 1;
    }
    // A flat cudaMemcpy from Mat::data is only valid when rows are stored back to back.
    if (!img1_host.isContinuous()) img1_host = img1_host.clone();
    if (!img2_host.isContinuous()) img2_host = img2_host.clone();

    const int row = img1_host.rows;
    const int col = img1_host.cols;
    const int length = row * col;  // number of pixels

    // Buffer size in bytes (one uchar per pixel).
    const size_t memSize = static_cast<size_t>(length) * sizeof(uchar);

    // Device buffers: two inputs and one output.
    uchar* img1_device = NULL;
    uchar* img2_device = NULL;
    uchar* imgres_device = NULL;
    checkCuda(cudaMalloc((void**)&img1_device, memSize), "cudaMalloc(img1)");
    checkCuda(cudaMalloc((void**)&img2_device, memSize), "cudaMalloc(img2)");
    checkCuda(cudaMalloc((void**)&imgres_device, memSize), "cudaMalloc(imgres)");

    // Copy host to device.
    checkCuda(cudaMemcpy(img1_device, img1_host.data, memSize, cudaMemcpyHostToDevice),
              "upload of img1");
    checkCuda(cudaMemcpy(img2_device, img2_host.data, memSize, cudaMemcpyHostToDevice),
              "upload of img2");

    // Launch configuration: 32x32 threads per block, 1-D grid.
    // The block count is the ceiling of length / threadsPerBlock so that the
    // launch always provides at least one thread per pixel. The previous
    // formula divided by 1025 and under-provisioned threads for large images.
    // NOTE(review): imageAdd is defined elsewhere; this assumes it flattens
    // the 2-D thread index and ignores indices >= length -- confirm.
    const dim3 block(32, 32, 1);
    const unsigned int threadsPerBlock = block.x * block.y * block.z;
    const dim3 grid((length + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
    imageAdd<<<grid, block>>>(img1_device, img2_device, imgres_device, length);
    // A kernel launch returns no status; query it explicitly.
    checkCuda(cudaGetLastError(), "imageAdd launch");

    // Copy device to host. This blocking copy also reports errors raised
    // while the kernel was executing.
    Mat imgres_host = Mat::zeros(row, col, CV_8UC1);
    checkCuda(cudaMemcpy(imgres_host.data, imgres_device, memSize, cudaMemcpyDeviceToHost),
              "download of result");

    // Show source and result images; block until a key is pressed.
    imshow("img1", img1_host);
    imshow("img2", img2_host);
    imshow("imgres", imgres_host);
    waitKey(0);

    // Release device memory.
    checkCuda(cudaFree(img1_device), "cudaFree(img1)");
    checkCuda(cudaFree(img2_device), "cudaFree(img2)");
    checkCuda(cudaFree(imgres_device), "cudaFree(imgres)");

    system("pause");
    return 0;
}
CUDA编程主函数模板及实例
最新推荐文章于 2024-03-31 03:33:36 发布