C++调用cuda库函数

爱吃橙子的哈士奇

已于 2023-09-12 23:06:14 修改

阅读量1.4k

点赞数 1

文章标签： c++ 算法开发语言

于 2023-07-03 11:27:12 首次发布

本文链接：https://blog.youkuaiyun.com/qq_34176467/article/details/131511700

版权

本文详细介绍了如何配置CUDA环境，包括设置环境变量、创建新工程并编写CUDA测试代码，以及通过C++调用CUDA函数进行加法运算。同时，文章还展示了如何获取GPU设备的信息，如名称、内存、计算能力等硬件特性。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

一、配置CUDA环境变量
参考：https://zhuanlan.zhihu.com/p/488518526

二、新建一个工程，在源文件中添加.cu和main.cpp

在这里插入图片描述

三、敲测试代码
File.cu

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void addKernel(int *c,  int *a,  int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
extern "C" void addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int * dev_a = 0;
    int * dev_b = 0;
    int * dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    // Allocate GPU buffers for three vectors (two input, one output)    
    cudaStatus = cudaMalloc((void **) & dev_c, size * sizeof(int));
    cudaStatus = cudaMalloc((void **) & dev_a, size * sizeof(int));
    cudaStatus = cudaMalloc((void **) & dev_b, size * sizeof(int));
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<< <1, size>>>(dev_c, dev_a, dev_b);
}

main.cpp

#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
extern "C" void addWithCuda(int* c, const int* a, const int* b, unsigned int size);
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    // Add vectors in parallel.
    addWithCuda(c, a, b, arraySize);
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);
    system("pause");
    return 0;
}

四、配置环境
1）右键工程-》生成依赖性-》生成自定义-》CUDA_12.0
在这里插入图片描述

在工程的属性-》连接器-》附加依赖项中添加cudart.lib；

2）右键File.cu，属性-》常规-》项类型-》CUDA C/C++
在这里插入图片描述
五、运行结果

打印显卡信息

void GetCudaInfo()
{
	int deviceCount;
	cudaGetDeviceCount(&deviceCount);	//Returns in *deviceCount the number of devices
	cout << "deviceCount:   " << deviceCount << "\n\n";
	if (deviceCount == 0)
	{
		cout << "error: no devices supporting CUDA.\n";
		exit(EXIT_FAILURE);
	}
	int dev = 0;
	cudaSetDevice(dev);	//Sets dev=0 device as the current device for the calling host thread.
	cudaDeviceProp devProps;
	cudaGetDeviceProperties(&devProps, dev);

	cout << "name: " << devProps.name << "\n";
	cout << "totalGlobalMem: " << devProps.totalGlobalMem << "\n";
	cout << "regsPerBlock: " << devProps.regsPerBlock << "\n";
	cout << "warpSize: " << devProps.warpSize << "\n";
	cout << "memPitch: " << devProps.memPitch << "\n\n";

	cout << "一个线程块中可使用的最大共享内存\n";
	cout << "devProps.sharedMemPerBlock: " << devProps.sharedMemPerBlock << "\n\n";

	cout << "一个线程块中可包含的最大线程数量\n";
	cout << "maxThreadsPerBlock: " << devProps.maxThreadsPerBlock << "\n\n";

	cout << "多维线程块数组中每一维可包含的最大线程数量\n";
	cout << "maxThreadsDim[0]: " << devProps.maxThreadsDim[0] << "\n";
	cout << "maxThreadsDim[1]: " << devProps.maxThreadsDim[1] << "\n";
	cout << "maxThreadsDim[2]: " << devProps.maxThreadsDim[2] << "\n\n";

	cout << "一个线程格中每一维可包含的最大线程块数量\n";
	cout << "maxGridSize[0]: " << devProps.maxGridSize[0] << "\n";
	cout << "maxGridSize[1]: " << devProps.maxGridSize[1] << "\n";
	cout << "maxGridSize[2]: " << devProps.maxGridSize[2] << "\n\n";

	cout << "clockRate: " << devProps.clockRate << "\n";
	cout << "totalConstMem: " << devProps.totalConstMem << "\n";
	cout << "textureAlignment: " << devProps.textureAlignment << "\n\n";

	cout << "计算能力：" << devProps.major << "." << devProps.minor << "\n\n";
	cout << "minor: " << devProps.minor << "\n";
	cout << "texturePitchAlignment: " << devProps.texturePitchAlignment << "\n";
	cout << "deviceOverlap: " << devProps.deviceOverlap << "\n";
	cout << "multiProcessorCount: " << devProps.multiProcessorCount << "\n";
	cout << "kernelExecTimeoutEnabled: " << devProps.kernelExecTimeoutEnabled << "\n";
	cout << "integrated: " << devProps.integrated << "\n";
	cout << "canMapHostMemory: " << devProps.canMapHostMemory << "\n";
	cout << "computeMode: " << devProps.computeMode << "\n";
	cout << "maxTexture1D: " << devProps.maxTexture1D << "\n";
	cout << "maxTexture1DMipmap: " << devProps.maxTexture1DMipmap << "\n";
	cout << "maxTexture1DLinear: " << devProps.maxTexture1DLinear << "\n";
	cout << "maxTexture2D: " << devProps.maxTexture2D << "\n";
	cout << "maxTexture2DMipmap: " << devProps.maxTexture2DMipmap << "\n";
	cout << "maxTexture2DLinear: " << devProps.maxTexture2DLinear << "\n";
	cout << "maxTexture2DGather: " << devProps.maxTexture2DGather << "\n";
	cout << "maxTexture3D: " << devProps.maxTexture3D << "\n";
	cout << "maxTexture3DAlt: " << devProps.maxTexture3DAlt << "\n";
	cout << "maxTextureCubemap: " << devProps.maxTextureCubemap << "\n";
	cout << "maxTexture1DLayered: " << devProps.maxTexture1DLayered << "\n";
	cout << "maxTexture2DLayered: " << devProps.maxTexture2DLayered << "\n";
	cout << "maxTextureCubemapLayered: " << devProps.maxTextureCubemapLayered << "\n";
	cout << "maxSurface1D: " << devProps.maxSurface1D << "\n";
	cout << "maxSurface2D: " << devProps.maxSurface2D << "\n";
	cout << "maxSurface3D: " << devProps.maxSurface3D << "\n";
	cout << "maxSurface1DLayered: " << devProps.maxSurface1DLayered << "\n";
	cout << "maxSurface2DLayered: " << devProps.maxSurface2DLayered << "\n";
	cout << "maxSurfaceCubemap: " << devProps.maxSurfaceCubemap << "\n";
	cout << "maxSurfaceCubemapLayered: " << devProps.maxSurfaceCubemapLayered << "\n";
	cout << "surfaceAlignment: " << devProps.surfaceAlignment << "\n";
	cout << "concurrentKernels: " << devProps.concurrentKernels << "\n";
	cout << "ECCEnabled: " << devProps.ECCEnabled << "\n";
	cout << "pciBusID: " << devProps.pciBusID << "\n";
	cout << "pciDeviceID: " << devProps.pciDeviceID << "\n";
	cout << "pciDomainID: " << devProps.pciDomainID << "\n";
	cout << "tccDriver: " << devProps.tccDriver << "\n";
	cout << "asyncEngineCount: " << devProps.asyncEngineCount << "\n";
	cout << "unifiedAddressing: " << devProps.unifiedAddressing << "\n";
	cout << "memoryClockRate: " << devProps.memoryClockRate << "\n";
	cout << "memoryBusWidth: " << devProps.memoryBusWidth << "\n";
	cout << "l2CacheSize: " << devProps.l2CacheSize << "\n";
	cout << "maxThreadsPerMultiProcessor: " << devProps.maxThreadsPerMultiProcessor << "\n";
	cout << "streamPrioritiesSupported: " << devProps.streamPrioritiesSupported << "\n";
	cout << "globalL1CacheSupported: " << devProps.globalL1CacheSupported << "\n";
	cout << "localL1CacheSupported: " << devProps.localL1CacheSupported << "\n";
	cout << "sharedMemPerMultiprocessor: " << devProps.sharedMemPerMultiprocessor << "\n";
	cout << "regsPerMultiprocessor: " << devProps.regsPerMultiprocessor << "\n";
	cout << "isMultiGpuBoard: " << devProps.isMultiGpuBoard << "\n";
	cout << "multiGpuBoardGroupID: " << devProps.multiGpuBoardGroupID << "\n";
	cout << "singleToDoublePrecisionPerfRatio: " << devProps.singleToDoublePrecisionPerfRatio << "\n";
	cout << "pageableMemoryAccess: " << devProps.pageableMemoryAccess << "\n";
	cout << "concurrentManagedAccess: " << devProps.concurrentManagedAccess << "\n";
}