主函数在 main.cpp 中，用 clang++ 编译；CUDA 函数放在 KernelWrapper.cu 中，用 nvcc 编译。另外，main.cpp 中需要包含头文件 KernelWrapper.h。
KernelWrapper.h
// KernelWrapper.h
// Declares the host-side entry point implemented in KernelWrapper.cu.
// The header contains no CUDA-specific syntax, so it can be included from
// main.cpp, which is built with a plain C++ compiler.
//
// The include guard avoids a leading underscore followed by an uppercase
// letter, because such identifiers are reserved for the implementation.
#ifndef KERNEL_WRAPPER_H
#define KERNEL_WRAPPER_H

// Fills a 16-element integer array, squares every element on the GPU and
// prints the values before and after the kernel runs.
void RunTest();

#endif // KERNEL_WRAPPER_H
KernelWrapper.cu
#include <stdio.h>
#include <stdlib.h>
#include "KernelWrapper.h"
// Squares one element of deviceArray in place per thread.
//
// Each thread derives a flat 1D global index from its block and thread
// coordinates and uses it directly as the array index. There is no bounds
// check, so the launch must not supply more threads than the array has
// elements.
__global__ void TestDevice(int *deviceArray)
{
    const int globalIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const int value = deviceArray[globalIndex];
    deviceArray[globalIndex] = value * value;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool CudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host-side demo driver.
//
// Allocates a 16-element int array on the host and on the device, fills the
// host copy with 1..16, copies it to the device, squares every element with
// the TestDevice kernel, copies the result back and prints the values before
// and after the kernel.
//
// Every CUDA runtime call is checked. On the first failure the error is
// written to stderr, the remaining steps are skipped, and the buffers are
// still released before returning.
void RunTest()
{
    const int arrayLength = 16;
    // TestDevice has no bounds check, so the launch must cover the array
    // exactly: arrayLength has to be a multiple of threadsPerBlock.
    const int threadsPerBlock = 4;
    const int blockCount = arrayLength / threadsPerBlock;
    const size_t memSize = sizeof(int) * arrayLength;

    int* hostArray = (int*)malloc(memSize);
    int* deviceArray = NULL;
    if (hostArray == NULL)
    {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long)memSize);
        return;
    }

    bool ok = CudaSucceeded(cudaMalloc((void**)&deviceArray, memSize), "cudaMalloc");

    if (ok)
    {
        printf("Init Data\n");
        for (int i = 0; i < arrayLength; i++)
        {
            hostArray[i] = i + 1;
            printf("%d\n", hostArray[i]);
        }
        ok = CudaSucceeded(
            cudaMemcpy(deviceArray, hostArray, memSize, cudaMemcpyHostToDevice),
            "cudaMemcpy (host to device)");
    }

    if (ok)
    {
        TestDevice<<<blockCount, threadsPerBlock>>>(deviceArray);
        // A kernel launch returns nothing; launch-configuration errors are
        // only visible through cudaGetLastError().
        ok = CudaSucceeded(cudaGetLastError(), "TestDevice launch");
    }

    if (ok)
    {
        // This blocking copy waits for the kernel to finish, so errors raised
        // while the kernel was executing are reported here.
        ok = CudaSucceeded(
            cudaMemcpy(hostArray, deviceArray, memSize, cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    if (ok)
    {
        printf("After Kernel Function\n");
        for (int i = 0; i < arrayLength; i++)
        {
            printf("%d\n", hostArray[i]);
        }
    }

    // deviceArray is still NULL if cudaMalloc failed; freeing NULL is a no-op.
    cudaFree(deviceArray);
    free(hostArray);

    if (ok)
    {
        printf("done");
    }
}
main.cpp
#include "KernelWrapper.h"
// Program entry point: runs the CUDA demo implemented in KernelWrapper.cu.
// Command-line arguments are accepted but not used.
int main(int argc, char** argv)
{
    (void)argc; // unused
    (void)argv; // unused
    RunTest();
    return 0;
}
makefile
# Builds `program` from two translation units:
#   main.cpp          - compiled with clang++
#   KernelWrapper.cu  - compiled with nvcc
# The final link is done by clang++ against the CUDA libraries.
#
# CUDA_PATH and CUDA_ARCH can be overridden on the command line, e.g.
#   make CUDA_ARCH=sm_50
CUDA_PATH ?= /usr/local/cuda
CUDA_ARCH ?= sm_20

# `all` and `clean` do not name files, so mark them phony.
.PHONY: all clean

all: program

# Objects are listed before the libraries so that linkers which resolve
# symbols left to right still find the CUDA symbols.
program: KernelWrapper.o main.o
	clang++ -o program KernelWrapper.o main.o -L$(CUDA_PATH)/lib -lcuda -lcudart

# Both objects include KernelWrapper.h, so rebuild them when it changes.
KernelWrapper.o: KernelWrapper.cu KernelWrapper.h
	$(CUDA_PATH)/bin/nvcc -c -arch=$(CUDA_ARCH) KernelWrapper.cu

main.o: main.cpp KernelWrapper.h
	clang++ -c main.cpp

clean:
	rm -f *.o program
本文介绍了如何在Mac环境中利用clang++编译C++部分,结合nvcc编译CUDA内核代码,以实现CUDA程序的完整构建过程。主要涉及KernelWrapper.h头文件的定义,以及在main.cpp中调用cuda函数。
335

被折叠的 条评论
为什么被折叠?



