Querying all of your GPU's information with the runtime API
The API used here is cudaGetDeviceProperties. All APIs referenced come from the official API documentation, see https://docs.nvidia.com/cuda/pdf/CUDA_Runtime_API.pdf
In the documentation you can see that this API is declared with the __host__ qualifier and returns cudaError_t, meaning it is called from host code and reports errors through the cudaError_t return type. So the first step is to write a header file that handles this error return.
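For reference, the declaration as it appears in the Runtime API document:
__host__ cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device);
With that in mind, here is the header: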
/******************************************************************
* Author: Da Liu
* Date: 2024-07-24
* File: common.cuh
* Description: Report CUDA runtime errors (down to the exact line)
*****************************************************************/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Print the error code, its name and description, plus the file and line
// where the failing call was made, then hand the error code back to the caller.
cudaError_t ErrorCheck(cudaError_t error_code, const char* filename, int lineNumber)
{
    if (error_code != cudaSuccess)
    {
        printf("CUDA error:\r\ncode=%d, name=%s, description=%s\r\nfile=%s, line=%d\r\n",
               (int)error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code),
               filename, lineNumber);
    }
    return error_code;
}
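To avoid typing __FILE__ and __LINE__ at every call site, ErrorCheck can also be wrapped in a macro. A minimal sketch (the CHECK name is my own addition, not part of the original code):
#define CHECK(call) ErrorCheck((call), __FILE__, __LINE__)
// usage: CHECK(cudaSetDevice(0));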
Now let's write the code that calls this API; create a .cu file. (You can also leave out the ErrorCheck calls entirely.) cudaGetDeviceProperties needs a struct defined first, as in "cudaDeviceProp device_prop;" in the code below. device_id is your GPU's ID, 0 by default; on a multi-GPU machine the IDs are 0, 1, 2, and so on. Pass device_id to the API and it fills the device_prop struct with a large number of properties; reading fields out of the struct is straightforward, just look at the code below.
/******************************************************************
* Author: Da Liu
* Date: 2024-07-24
* File: query.cu
* Description: Query GPU device information via the runtime API.
*****************************************************************/
#include <stdio.h>
#include "../cudalearn/tools/common.cuh"
int main()
{
    int device_id = 0;
    ErrorCheck(cudaSetDevice(device_id), __FILE__, __LINE__);

    cudaDeviceProp device_prop;
    ErrorCheck(cudaGetDeviceProperties(&device_prop, device_id), __FILE__, __LINE__);

    printf("Device %d: %s\n", device_id, device_prop.name);
    printf("Compute Capability: %d.%d\n", device_prop.major, device_prop.minor);
    printf("Memory Clock Rate (KHz): %d\n", device_prop.memoryClockRate);
    printf("Memory Bus Width (bits): %d\n", device_prop.memoryBusWidth);
    printf("Peak Memory Bandwidth (GB/s): %f\n",
           2.0 * device_prop.memoryClockRate * (device_prop.memoryBusWidth / 8) / 1.0e6);
    printf("Multiprocessors: %d\n", device_prop.multiProcessorCount);
    printf("Total Global Memory (GB): %f\n", device_prop.totalGlobalMem / 1.0e9);
    printf("Total Constant Memory (KB): %zu\n", device_prop.totalConstMem / 1024);
    printf("Total Shared Memory per Block (KB): %zu\n", device_prop.sharedMemPerBlock / 1024);
    printf("Warp Size: %d\n", device_prop.warpSize);
    printf("Maximum Threads per Block: %d\n", device_prop.maxThreadsPerBlock);
    printf("Maximum Threads per Multiprocessor: %d\n", device_prop.maxThreadsPerMultiProcessor);
    printf("Maximum Grid Size: (%d, %d, %d)\n",
           device_prop.maxGridSize[0], device_prop.maxGridSize[1], device_prop.maxGridSize[2]);
    printf("Concurrent Kernels: %d\n", device_prop.concurrentKernels);
    printf("ECC Enabled: %d\n", device_prop.ECCEnabled);
    printf("PCI Bus ID: %d\n", device_prop.pciBusID);
    printf("PCI Device ID: %d\n", device_prop.pciDeviceID);
    return 0;
}
Now turn this code into an executable. In a terminal in the code directory, run:
nvcc name.cu -o name
./name.exe
and the results will be printed. (On Windows nvcc produces name.exe; on Linux the output file is simply name, so run ./name.)
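As mentioned above, a machine may have several GPUs with IDs 0, 1, 2, and so on. A minimal sketch, not part of the original program, that enumerates every device with cudaGetDeviceCount before querying it:
/* enumerate.cu -- hypothetical helper, assumes only the CUDA runtime */
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess)
    {
        printf("cudaGetDeviceCount failed\n");
        return 1;
    }
    for (int id = 0; id < device_count; ++id)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        printf("Device %d: %s (compute capability %d.%d)\n",
               id, prop.name, prop.major, prop.minor);
    }
    return 0;
}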
Computing the CUDA core count from the major and minor compute capability returned by the API
In the output above you can see a compute capability of 8.9, which comes from device_prop.major and device_prop.minor, so the major and minor versions of my RTX 4060 are 8 and 9. From these two numbers you can work out the number of CUDA cores: the multiprocessor count times a fixed per-architecture factor (I haven't fully figured out where these factors come from; I borrowed the formula from other people's code). For example, for compute capability 8.9 the factor is 128 cores per multiprocessor. The code below computes the CUDA core count.
/******************************************************************
* Author: Da Liu
* Date: 2024-07-25
* File: computcores.cu
* Description: Compute the number of CUDA cores on the current GPU.
*****************************************************************/
#include <stdio.h>
#include "../cudalearn/tools/common.cuh"
int getSPcores(cudaDeviceProp deviceProp, const char** archName)
{
    int cores = 0;
    int mp = deviceProp.multiProcessorCount;
    switch (deviceProp.major)
    {
    case 2: // Fermi
        *archName = "Fermi";
        if (deviceProp.minor == 1) cores = mp * 48;
        else cores = mp * 32;
        break;
    case 3: // Kepler
        *archName = "Kepler";
        cores = mp * 192;
        break;
    case 5: // Maxwell
        *archName = "Maxwell";
        cores = mp * 128;
        break;
    case 6: // Pascal
        *archName = "Pascal";
        if ((deviceProp.minor == 1) || (deviceProp.minor == 2)) cores = mp * 128;
        else if (deviceProp.minor == 0) cores = mp * 64;
        else printf("Unknown GPU architecture\n");
        break;
    case 7: // Volta (7.0) and Turing (7.5)
        *archName = (deviceProp.minor == 5) ? "Turing" : "Volta";
        if ((deviceProp.minor == 0) || (deviceProp.minor == 5)) cores = mp * 64;
        else printf("Unknown GPU architecture\n");
        break;
    case 8: // Ampere (8.0, 8.6) and Ada Lovelace (8.9)
        *archName = (deviceProp.minor == 9) ? "Ada Lovelace" : "Ampere";
        if (deviceProp.minor == 0) cores = mp * 64;
        else if ((deviceProp.minor == 6) || (deviceProp.minor == 9)) cores = mp * 128;
        else printf("Unknown GPU architecture\n");
        break;
    case 9: // Hopper
        *archName = "Hopper";
        if (deviceProp.minor == 0) cores = mp * 128;
        else printf("Unknown GPU architecture\n");
        break;
    default:
        printf("Unknown GPU architecture\n");
        break;
    }
    return cores;
}
int main()
{
    int device_id = 0;
    const char* archname = NULL;
    ErrorCheck(cudaSetDevice(device_id), __FILE__, __LINE__);

    cudaDeviceProp deviceProp;
    ErrorCheck(cudaGetDeviceProperties(&deviceProp, device_id), __FILE__, __LINE__);
    printf("Device: %s\n", deviceProp.name);

    int spCores = getSPcores(deviceProp, &archname);
    printf("GPU Architecture: %s\n", archname);
    printf("SP cores: %d\n", spCores);
    return 0;
}
Again, turn the code into an executable by running the following in a terminal in the code directory:
nvcc name.cu -o name
./name.exe
and the results will be printed.
As you can see, my RTX 4060 is an Ada Lovelace GPU (compute capability 8.9) with 3072 CUDA cores, i.e. 24 multiprocessors times 128 cores per multiprocessor.
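As an aside, when only a handful of values are needed, the runtime also offers cudaDeviceGetAttribute, which queries a single attribute at a time instead of filling the whole cudaDeviceProp struct. A minimal sketch (the file and variable names are my own):
/* attribute.cu -- hypothetical example using cudaDeviceGetAttribute */
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int device_id = 0;
    int sm_count = 0, major = 0, minor = 0;
    // Each call fetches exactly one attribute of the given device.
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_id);
    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id);
    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id);
    printf("Device %d: %d SMs, compute capability %d.%d\n", device_id, sm_count, major, minor);
    return 0;
}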