CUDA的host与device数据交互示例

最新推荐文章于 2024-10-16 22:30:57 发布

原创 · 最新推荐文章于 2024-10-16 22:30:57 发布 · 1.6k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#cuda #timer #float #processing

计算电磁学专栏收录该内容

41 篇文章

订阅专栏

本文介绍了一个简单的CUDA程序示例，演示了如何初始化CUDA环境、分配设备内存，并通过两个内核函数实现数据处理：字符串复制到设备内存及浮点数数组的生成与填充。此外，还展示了如何测量内核执行时间并从设备复制结果到主机。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

/********************************************************************
 *  sample.cu
 *  Example CUDA program: selects a device, allocates device memory,
 *  runs a small kernel that fills a float array, measures the elapsed
 *  time, and copies the result back to the host for printing.
 *
 *  Depends on the legacy CUDA SDK "cutil" helpers (cutil_inline.h) for
 *  error checking (cutilSafeCall / cutilCheckError / cutilCheckMsg)
 *  and for the host-side timer (cutCreateTimer & friends).
 *********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>

/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
#if __DEVICE_EMULATION__

/* In device-emulation builds there is no device to select. */
bool InitCUDA(void) { return true; }

#else

/*
 * Selects the first device whose reported major compute capability is >= 1.
 *
 * Returns true when a device was found and selected, false otherwise
 * (a diagnostic is written to stderr in that case).
 */
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    /* Treat a failing query the same way as "no device present". */
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    /* Stop at the first device that reports compute capability >= 1.x. */
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }

    /* The loop ran to completion: no device qualified. */
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    if (cudaSetDevice(i) != cudaSuccess) {
        fprintf(stderr, "Failed to select CUDA device %d.\n", i);
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}

#endif

/************************************************************************/
/* Example kernels                                                      */
/************************************************************************/

/*
 * Copies the first `num` bytes of the string "Hello CUDA!" into `result`.
 *
 * result : device buffer with room for at least `num` chars.
 * num    : number of bytes to copy. The source string occupies 12 bytes
 *          including its terminating NUL; larger values are clamped to 12
 *          so the kernel never reads past the end of the local array.
 *
 * Works with any 1D launch configuration: the loop strides by the total
 * number of launched threads, so <<<1, 1>>> copies everything serially.
 */
__global__ static void HelloCUDA(char* result, int num)
{
    const char p_HelloCUDA[] = "Hello CUDA!";
    const int available = (int)sizeof(p_HelloCUDA);
    const int limit = (num < available) ? num : available;
    const int stride = gridDim.x * blockDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < limit; i += stride) {
        result[i] = p_HelloCUDA[i];
    }
}

/*
 * Fills result[idx] = 1.1f + idx for every idx in [0, num).
 *
 * result : device buffer with room for at least `num` floats.
 * num    : number of elements to write.
 *
 * Works with any 1D launch configuration: the loop strides by the total
 * number of launched threads, so <<<1, 1>>> fills the array serially.
 */
__global__ static void TESTcuda(float* result, int num)
{
    const float test = 1.1f;   /* float literal: avoids a double constant */
    const int stride = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; idx += stride) {
        result[idx] = test + idx;
    }
}

/************************************************************************/
/* Program entry point                                                  */
/************************************************************************/

/*
 * Runs TESTcuda on a 12-element device array, reports the elapsed time
 * measured with the cutil timer, then copies the values back to the host
 * and prints them one per line.
 *
 * The HelloCUDA (string) variant of the same flow is kept below as
 * commented-out reference code.
 */
int main(int argc, char* argv[])
{
    /* Element count shared by the allocation, the launch and the copy. */
    const int kCount = 12;

    if (!InitCUDA()) {
        return 0;
    }

    // char  *device_result   = 0;
    // char   host_result[12] = {0};
    float *device_result2   = 0;
    float  host_result2[12] = {0};

    /* Allocate device memory. */
    // cutilSafeCall( cudaMalloc((void**) &device_result, sizeof(char) * 11));
    cutilSafeCall( cudaMalloc((void**) &device_result2, sizeof(float) * kCount));

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // HelloCUDA<<<1, 1, 0>>>(device_result, 11);
    TESTcuda<<<1, 1, 0>>>(device_result2, kCount);
    cutilCheckMsg("Kernel execution failed\n");

    /*
     * Kernel launches are asynchronous: wait for completion before stopping
     * the timer, and check the result so a fault inside the kernel is
     * reported here instead of at the later cudaMemcpy.
     * NOTE(review): cudaThreadSynchronize() is deprecated in newer toolkits
     * in favour of cudaDeviceSynchronize(); kept because this file targets
     * the cutil-era SDK - confirm the toolkit version before switching.
     */
    cutilSafeCall( cudaThreadSynchronize());

    cutilCheckError( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    /* Copy the result from device to host. */
    // cutilSafeCall( cudaMemcpy(host_result, device_result, sizeof(char) * 11, cudaMemcpyDeviceToHost));
    // printf("%s\n", host_result);
    cutilSafeCall( cudaMemcpy(host_result2, device_result2, sizeof(float) * kCount, cudaMemcpyDeviceToHost));

    printf("the result is :\n");
    for (int idx = 0; idx < kCount; idx++) {
        printf("%f \n", host_result2[idx]);
    }

    cutilSafeCall( cudaFree(device_result2));

    return 0;
}