Starting with CUDA 6.0 there is a "Unified Memory" programming model: a single pointer can access both CPU and GPU memory, with no manual copies required
After the host launches a kernel, control returns to the host immediately (similar to launching a thread and not joining it)
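A minimal, self-contained sketch of both points (the kernel name scaleKernel and the sizes are illustrative, not taken from these notes):
#include <cuda_runtime.h>
#include <stdio.h>
// Unified Memory sketch: one allocation, no explicit cudaMemcpy.
__global__ void scaleKernel(float *data, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) data[i] *= 2.0f;
}
int main(void)
{
int n = 1024;
float *data;
cudaMallocManaged(&data, n * sizeof(float));// single pointer, accessible from host and device
for (int i = 0; i < n; i++) data[i] = 1.0f; // host writes through the same pointer
scaleKernel<<<(n + 255) / 256, 256>>>(data, n);// launch returns to the host immediately
cudaDeviceSynchronize();                    // wait for the kernel before the host reads
printf("data[0] = %f\n", data[0]);          // host reads the result directly
cudaFree(data);
return 0;
}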
C function | CUDA C function |
---|---|
malloc | cudaMalloc |
memcpy | cudaMemcpy |
memset | cudaMemset |
free | cudaFree |
cudaError_t cudaMalloc(void** devPtr, size_t size)
Allocates linear device memory; devPtr returns the pointer to the allocated memory, size is the number of bytes
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
- Transfers data between host and device: copies count bytes from src to dst, in the direction given by kind
- kind:
cudaMemcpyHostToHost
cudaMemcpyHostToDevice
cudaMemcpyDeviceToHost
cudaMemcpyDeviceToDevice
- Synchronous: the host blocks until the transfer completes and cudaMemcpy returns
- Returns the error enum cudaError_t: cudaSuccess on success; on failure an error code such as cudaErrorMemoryAllocation
const char* cudaGetErrorString(cudaError_t error)
Returns a human-readable message describing the error, similar to strerror in C
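A minimal sketch that combines the calls above with a simple error check (the array contents and sizes are arbitrary, chosen only for illustration):
#include <cuda_runtime.h>
#include <stdio.h>
int main(void)
{
const int n = 8;
size_t nBytes = n * sizeof(float);
float h_A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
float h_B[8];
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, nBytes);  // allocate linear device memory
if (err != cudaSuccess) printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
cudaMemset(d_A, 0, nBytes);                           // clear the device buffer
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice); // host -> device
cudaMemcpy(h_B, d_A, nBytes, cudaMemcpyDeviceToHost); // device -> host, blocks until the copy is done
printf("h_B[7] = %f\n", h_B[7]);
cudaFree(d_A);                                        // release the device memory
return 0;
}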
Memory hierarchy
#include <stdlib.h>
#include <time.h>
void sumArraysOnHost(float *A, float *B, float *C, const int N)// element-wise array addition
{
for (int idx = 0; idx < N; idx++)
{
C[idx] = A[idx] + B[idx];
}
}
void initialData(float *ip, int size)// initialize the array with random numbers
{
time_t t;
srand((unsigned) time(&t));// seed the random-number generator
for (int i = 0; i < size; i++)
{// rand() returns an int; masking with 0xFF keeps only the low 8 bits, which bounds the value
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
return;
}
int main(int argc, char **argv)
{
int nElem = 1024;
size_t nBytes = nElem * sizeof(float);
float *h_A, *h_B, *h_C;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
h_C = (float *)malloc(nBytes);
initialData(h_A, nElem);
initialData(h_B, nElem);
sumArraysOnHost(h_A, h_B, h_C, nElem);
free(h_A);
free(h_B);
free(h_C);
return(0);
}
nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum
-Xcompiler forwards the following command-line option to the host C compiler or preprocessor
-std=c99 is passed to the compiler to select the C standard
参考: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
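The later examples contain device code, so they go into .cu files and are compiled with nvcc directly; a typical command (file and output names are just examples) looks like:
nvcc sumArraysOnGPU-timer.cu -o sum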
Thread management
- All threads in the same grid share the same global memory space
- A grid is made up of multiple blocks, and a thread block contains a group of threads
- Threads within the same block can 1. synchronize and 2. share memory; threads in different blocks cannot cooperate
- Indexing
- A thread locates its data through blockIdx (the block's index within the grid) and threadIdx (the thread's index within the block); these coordinates assign different data to different threads
- The coordinate variables are CUDA built-in vector types based on uint3 (blockIdx.x, blockIdx.y, blockIdx.z; threadIdx.x, threadIdx.y, threadIdx.z)
- Both grids and blocks can be organized in up to three dimensions; typically a grid is organized as a 2-D array of blocks and a block as a 3-D array of threads
- gridDim is the grid dimension (the number of blocks per grid) and blockDim is the block dimension (the number of threads per block); both are dim3 variables
- In a dim3 variable every unspecified component is initialized to 1; each dimension is accessed through x, y, z (blockDim.x, blockDim.y, blockDim.z)
- On the host, grid and block are defined with dim3; inside a kernel they show up as uint3-typed built-ins, i.e. the host-side dim3 variables correspond to the device-side built-ins (block corresponds to blockDim, grid corresponds to gridDim)
Host/device correspondence
#include <cuda_runtime.h>
#include <stdio.h>
__global__ void checkIndex(void)
{
printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);//因为是1维的,所以y,z维度坐标都是0
printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);
printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z);//因为是1维的,所以y,z维度都初始化为1
printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z);//因为是1维的,所以y,z维度都初始化为1
}
int main(int argc, char **argv)
{
int nElem = 6;// assume there are 6 data elements
dim3 block(3);// 3 threads per block
// integer division in C truncates, so (nElem + block.x - 1) / block.x rounds up and guarantees enough blocks to cover every element (plain nElem / block.x would round down)
dim3 grid((nElem + block.x - 1) / block.x);// the grid ends up with 2 blocks
// check the grid and block configuration on the host side
printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);
// check the grid and block configuration on the device side
checkIndex<<<grid, block>>>();
cudaDeviceReset();
return(0);
}
Resetting the grid and block dimensions
#include <cuda_runtime.h>
#include <stdio.h>
int main(int argc, char **argv)
{
int nElem = 1024;// 1024 data elements in total
// 1-D block with 1024 threads; 1-D grid with 1 block
dim3 block (1024);
dim3 grid ((nElem + block.x - 1) / block.x);
printf("grid.x %d block.x %d \n", grid.x, block.x);
// 1-D block with 512 threads; 1-D grid with 2 blocks
block.x = 512;
grid.x = (nElem + block.x - 1) / block.x;
printf("grid.x %d block.x %d \n", grid.x, block.x);
// 1-D block with 256 threads; 1-D grid with 4 blocks
block.x = 256;
grid.x = (nElem + block.x - 1) / block.x;
printf("grid.x %d block.x %d \n", grid.x, block.x);
// 1-D block with 128 threads; 1-D grid with 8 blocks
block.x = 128;
grid.x = (nElem + block.x - 1) / block.x;
printf("grid.x %d block.x %d \n", grid.x, block.x);
cudaDeviceReset();
return(0);
}
Launching a kernel
kernel_name<<<grid, block>>>(argument list);
Compared with an ordinary C call, function_name(argument list),
the kernel launch adds the execution configuration <<<grid, block>>>
- Suppose there are 8 elements: launch two 1-D blocks with 4 threads each
kernel_name<<<2, 4>>>(argument list)
- A kernel launch is asynchronous with respect to the host: control returns to the host immediately after the call; cudaError_t cudaDeviceSynchronize(void) can be used to wait for the kernel to finish (see the sketch below)
- Some CUDA runtime calls are synchronous, e.g. cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind): control returns to the host only after the copy has completed
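A short sketch of that launch/synchronize pattern (kernel_name, grid, block and the argument list are placeholders, as above):
kernel_name<<<grid, block>>>(argument list); // returns to the host immediately; the kernel runs asynchronously
cudaDeviceSynchronize();                     // the host blocks here until all previously launched work has finished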
Writing kernel functions
Qualifier | Called from | Executes on | Notes |
---|---|---|---|
`__global__` | host | device | must have a void return type |
`__device__` | device | device | |
`__host__` | host | host | can be omitted |
`__device__` and `__host__` can be used together, so that the function is compiled for both the host and the device.
- Kernel restrictions:
1) Can only access device memory
2) Must have a void return type
3) No support for a variable number of arguments
4) No support for static variables
5) Behave asynchronously (in general)
- CPU version:
void sumArraysOnHost(float *A, float *B, float *C, const int N){
for (int i = 0; i < N; i++)
C[i] = A[i] + B[i];
}
GPU version:
__global__ void sumArraysOnGPU(float *A, float *B, float *C)// element-wise array addition
{
int i = threadIdx.x;// the loop disappears: threadIdx.x selects the element this thread handles
C[i] = A[i] + B[i];
}
Verifying the kernel
Verify the results
void checkResult(float *h_C, float *gpuRef, const int N){// compare the CPU and GPU results
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++){
if (abs(h_C[i] - gpuRef[i]) > epsilon){// compare the host result with the device result at this index
match = 0;// clear the match flag so "Arrays match" is not printed
printf("Arrays do not match!\n");
printf("host %5.2f gpu %5.2f at current %d\n", h_C[i],gpuRef[i], i);
break;
}
}
if (match) printf("Arrays match.\n\n");
return;
}
Handling errors
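A common pattern is to wrap every CUDA runtime call in a checking macro; a minimal sketch (the macro name CHECK is just a convention, and it needs <stdio.h> and <stdlib.h>):
#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        printf("Error: %s:%d, ", __FILE__, __LINE__);                          \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error));     \
        exit(1);                                                               \
    }                                                                          \
}
// Usage: CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));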
Putting it all together
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
void checkResult(float *h_C, float *gpuRef, const int N){// compare the CPU and GPU results
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++){
if (abs(h_C[i] - gpuRef[i]) > epsilon){// compare the host result with the device result at this index
match = 0;// clear the match flag so "Arrays match" is not printed
printf("Arrays do not match!\n");
printf("host %5.2f gpu %5.2f at current %d\n", h_C[i],gpuRef[i], i);
break;
}
}
if (match) printf("Arrays match.\n\n");
return;
}
void initialData(float *ip, int size){// fill the array with random floats
time_t t;
srand((unsigned) time(&t));
for (int i = 0; i < size; i++){
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int N){
for (int idx = 0; idx < N; idx++)
C[idx] = A[idx] + B[idx];
}
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
int i = threadIdx.x;
if (i < N) C[i] = A[i] + B[i];// only threads whose index falls inside the array do any work
}
int main(int argc, char **argv){
int nElem = 32;
// malloc host memory
size_t nBytes = nElem * sizeof(float);
float *h_A, *h_B, *h_C, *gpuRef;
h_A = (float *)malloc(nBytes);// host array A
h_B = (float *)malloc(nBytes);// host array B
h_C = (float *)malloc(nBytes);// host array C, stores the result of h_A + h_B
gpuRef = (float *)malloc(nBytes);// host buffer that receives the result copied back from the GPU
initialData(h_A, nElem); // initialize host array A with random floats
initialData(h_B, nElem); // initialize host array B with random floats
memset(h_C, 0, nBytes); // zero the host result array
memset(gpuRef, 0, nBytes); // zero the buffer for the device result
float *d_A, *d_B, *d_C;
cudaMalloc((float**)&d_A, nBytes); // allocate device array A; the first argument is the address of the pointer (d_A itself lives in host memory)
cudaMalloc((float**)&d_B, nBytes); // allocate device array B; after cudaMalloc the pointer refers to device memory
cudaMalloc((float**)&d_C, nBytes); // allocate device array C
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);// copy the host array to the device
cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);// copy the host array to the device
cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);// copy the zeroed host buffer to the device
// configure grid and block on the host
dim3 block (nElem);
dim3 grid (1);
sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);// compute the array sum on the device
cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);// copy the device result back to the host
sumArraysOnHost(h_A, h_B, h_C, nElem);// compute the array sum on the host
checkResult(h_C, gpuRef, nElem);// compare the results
// free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// free host memory
free(h_A);
free(h_B);
free(h_C);
free(gpuRef);
cudaDeviceReset();
return(0);
}
Timing
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
void checkResult(float *h_C, float *gpuRef, const int N){
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++){
if (abs(h_C[i] - gpuRef[i]) > epsilon){
match = 0;
printf("Arrays do not match!\n");
printf("host %5.2f gpu %5.2f at current %d\n", h_C[i],
gpuRef[i], i);
break;
}
}
if (match) printf("Arrays match.\n\n");
return;
}
double seconds(){
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
void initialData(float *ip, int size){
time_t t;
srand((unsigned) time(&t));
for (int i = 0; i < size; i++){
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int N){
for (int idx = 0; idx < N; idx++){
C[idx] = A[idx] + B[idx];
}
}
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i];// the computed global index can exceed the array bounds, so guard with i < N
}
int main(int argc, char **argv)
{
printf("%s Starting...\n", argv[0]);
int nElem = 32;
size_t nBytes = nElem * sizeof(float);
float *h_A, *h_B, *h_C, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
h_C = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
double iStart, iElaps;
// initialize the host data
iStart = seconds();
initialData(h_A, nElem);
initialData(h_B, nElem);
iElaps = seconds() - iStart;
printf("initialData Time elapsed %f sec\n", iElaps);
memset(h_C, 0, nBytes);
memset(gpuRef, 0, nBytes);
// compute on the host
iStart = seconds();
sumArraysOnHost(h_A, h_B, h_C, nElem);
iElaps = seconds() - iStart;
printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);
// set up the device
float *d_A, *d_B, *d_C;
cudaMalloc((float**)&d_A, nBytes);// the (float**) cast can be omitted; it is used here to make explicit that the address of the pointer (a pointer to a pointer) is passed
cudaMalloc((float**)&d_B, nBytes);
cudaMalloc((float**)&d_C, nBytes);
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);
int iLen = 512;
dim3 block (iLen);
dim3 grid ((nElem + block.x - 1) / block.x);
iStart = seconds();
sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
cudaDeviceSynchronize();
iElaps = seconds() - iStart;
printf("sumArraysOnGPU <<< %d, %d >>> Time elapsed %f sec\n", grid.x,
block.x, iElaps);
// copy the result back from the device
cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
// check the results
checkResult(h_C, gpuRef, nElem);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
free(gpuRef);
return(0);
}
nvprof
Available since CUDA 5.0, nvprof reports profiling information about the program's execution
nvprof [nvprof_args] <application> [application_args]
nvprof --help
nvprof ./sumArraysOnGPU-timer
~/code/cuda$ nvprof ./sum
./sum Starting...
Vector size 32
initialData Time elapsed 0.000004 sec
sumArraysOnHost Time elapsed 0.000000 sec
==5121== NVPROF is profiling process 5121, command: ./sum
sumArraysOnGPU <<< 1, 512 >>> Time elapsed 0.000025 sec
Arrays match.
==5121== Profiling application: ./sum
==5121== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 43.48% 1.9200us 3 640ns 576ns 768ns [CUDA memcpy HtoD]
42.03% 1.8560us 1 1.8560us 1.8560us 1.8560us sumArraysOnGPU(float*, float*, float*, int)
14.49% 640ns 1 640ns 640ns 640ns [CUDA memcpy DtoH]
API calls: 98.84% 143.30ms 3 47.768ms 3.2070us 143.29ms cudaMalloc
0.72% 1.0374ms 96 10.806us 519ns 507.17us cuDeviceGetAttribute
0.25% 364.98us 1 364.98us 364.98us 364.98us cuDeviceTotalMem
0.08% 113.68us 1 113.68us 113.68us 113.68us cuDeviceGetName
0.06% 85.675us 3 28.558us 4.0100us 75.346us cudaFree
0.03% 41.697us 4 10.424us 4.4080us 24.272us cudaMemcpy
0.01% 20.049us 1 20.049us 20.049us 20.049us cudaLaunchKernel
0.01% 7.6710us 1 7.6710us 7.6710us 7.6710us cuDeviceGetPCIBusId
0.00% 4.1670us 3 1.3890us 516ns 2.7560us cuDeviceGetCount
0.00% 3.7380us 1 3.7380us 3.7380us 3.7380us cudaDeviceSynchronize
0.00% 3.5770us 2 1.7880us 533ns 3.0440us cuDeviceGet
0.00% 1.0310us 1 1.0310us 1.0310us 1.0310us cuDeviceGetUuid
Organizing parallel threads
Building matrix indices from blocks and threads
Storing a 2-D matrix in linear memory
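With row-major linear storage, the element at column ix and row iy of an nx-wide matrix lives at idx = iy * nx + ix; for example, with nx = 8, the element at (ix, iy) = (3, 2) is stored at idx = 2 * 8 + 3 = 19.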
#include <cuda_runtime.h>
#include <stdio.h>
void printMatrix(int *C, const int nx, const int ny)
{
int *ic = C;
for (int iy = 0; iy < ny; iy++){
for (int ix = 0; ix < nx; ix++){
printf("%3d", ic[ix]);
}
ic += nx;// advance the pointer by one row of elements
printf("\n");
}
printf("\n");
return;
}
__global__ void printThreadIndex(int *A, const int nx, const int ny){// launched with 2-D blocks
//dim3 block(4, 2); x: [0,1,2,3], y: [0,1]
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
" %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
ix, iy, idx, A[idx]);
}
int main(int argc, char **argv)
{
printf("%s Starting...\n", argv[0]);
int nx = 8; // 8 columns (elements per row)
int ny = 6; // 6 rows
int nxy = nx * ny; // total number of elements
int nBytes = nxy * sizeof(int);// host allocation; the matrix is stored in linear memory
int *h_A = (int *)malloc(nBytes);
for (int i = 0; i < nxy; i++){// initialize the matrix with 0..nxy-1
h_A[i] = i;
}
printMatrix(h_A, nx, ny);
int *d_MatA;// device matrix, allocated below
cudaMalloc((void **)&d_MatA, nBytes);
cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice); // copy the host matrix to the device
dim3 block(4, 2);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);//(8+4-1)/4=2,(6+2-1)/2=3
printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);// grid of 2x3 = 6 blocks, each block 4x2 = 8 threads
cudaFree(d_MatA);
free(h_A);
cudaDeviceReset();
return (0);
}
Summing matrices with a 2-D grid and 2-D blocks
#include <cuda_runtime.h>
#include <stdio.h>
void initialData(float *ip, const int size){// initialize the matrix with random floats
for(int i = 0; i < size; i++){
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
return;
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){// matrix addition on the host
float *ia = A;
float *ib = B;
float *ic = C;
for (int iy = 0; iy < ny; iy++){
for (int ix = 0; ix < nx; ix++){
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;// after each row, advance the pointers by one row
ib += nx;
ic += nx;
}
return;
}
void checkResult(float *h_C, float *gpuRef, const int N){
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++){
if (abs(h_C[i] - gpuRef[i]) > epsilon){
match = 0;
printf("host %f gpu %f\n", h_C[i], gpuRef[i]);
break;
}
}
if (match)
printf("Arrays match.\n\n");
else
printf("Arrays do not match.\n\n");
}
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny){
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;// x coordinate (column index)
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;// y coordinate (row index)
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
MatC[idx] = MatA[idx] + MatB[idx];
}
int main(int argc, char **argv)
{
printf("%s Starting...\n", argv[0]);
//set up the host data
int nx = 1 << 14;// number of matrix columns
int ny = 1 << 14;// number of matrix rows
int nxy = nx * ny;// total number of elements
int nBytes = nxy * sizeof(float);
float *h_A, *h_B, *h_C, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
h_C = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
initialData(h_A, nxy);
initialData(h_B, nxy);
memset(h_C, 0, nBytes);
memset(gpuRef, 0, nBytes);
sumMatrixOnHost(h_A, h_B, h_C, nx, ny);
//set up the device
float *d_MatA, *d_MatB, *d_MatC;
cudaMalloc((void **)&d_MatA, nBytes);
cudaMalloc((void **)&d_MatB, nBytes);
cudaMalloc((void **)&d_MatC, nBytes);
cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
cudaDeviceSynchronize();
cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);
checkResult(h_C, gpuRef, nxy);
//free memory
cudaFree(d_MatA);
cudaFree(d_MatB);
cudaFree(d_MatC);
free(h_A);
free(h_B);
free(h_C);
free(gpuRef);
cudaDeviceReset();
return (0);
}
Summing matrices with a 1-D grid and 1-D blocks
__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx,int ny){
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
if (ix < nx )
for (int iy = 0; iy < ny; iy++){
int idx = iy * nx + ix;
MatC[idx] = MatA[idx] + MatB[idx];
}
}
dim3 block(32, 1);
dim3 grid((nx + block.x - 1) / block.x, 1);
Summing matrices with a 2-D grid and 1-D blocks
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx,
int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = blockIdx.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
MatC[idx] = MatA[idx] + MatB[idx];
}
int dimx = 32;
dim3 block(dimx, 1);
dim3 grid((nx + block.x - 1) / block.x, ny);
Device management
Querying GPU information with the runtime API
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int main(int argc, char **argv){
printf("%s Starting...\n", argv[0]);
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);// number of CUDA-capable GPUs in the system
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
int dev = 0, driverVersion = 0, runtimeVersion = 0;
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("Device %d: \"%s\"\n", dev, deviceProp.name);
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion / 1000, (driverVersion % 100) / 10,
runtimeVersion / 1000, (runtimeVersion % 100) / 10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n",
deviceProp.major, deviceProp.minor);
printf(" Total amount of global memory: %.2f GBytes (%llu "
"bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
(unsigned long long)deviceProp.totalGlobalMem);
printf(" GPU Clock rate: %.0f MHz (%0.2f "
"GHz)\n", deviceProp.clockRate * 1e-3f,
deviceProp.clockRate * 1e-6f);
printf(" Memory Clock rate: %.0f Mhz\n",
deviceProp.memoryClockRate * 1e-3f);
printf(" Memory Bus Width: %d-bit\n",
deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize)
{
printf(" L2 Cache Size: %d bytes\n",
deviceProp.l2CacheSize);
}
printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), "
"2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
deviceProp.maxTexture3D[2]);
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, "
"2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
deviceProp.maxTexture2DLayered[1],
deviceProp.maxTexture2DLayered[2]);
printf(" Total amount of constant memory: %lu bytes\n",
deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n",
deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n",
deviceProp.regsPerBlock);
printf(" Warp size: %d\n",
deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n",
deviceProp.maxThreadsPerBlock);
printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %lu bytes\n",
deviceProp.memPitch);
exit(EXIT_SUCCESS);
}
Selecting the best GPU
int numDevices = 0;
cudaGetDeviceCount(&numDevices);
if(numDevices > 1){
int maxMultiprocessors = 0, maxDevice = 0;
for(int device=0; device<numDevices; device++){
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
if (maxMultiprocessors < props.multiProcessorCount){
maxMultiprocessors = props.multiProcessorCount;
maxDevice = device;
}
}
cudaSetDevice(maxDevice);// use the device with the most multiprocessors
}
Querying GPU information with nvidia-smi
nvidia-smi -L
nvidia-smi -q -i 0
nvidia-smi -q -i 0 -d MEMORY | tail -n 5
nvidia-smi -q -i 0 -d UTILIZATION | tail -n 4
Setting the device at run time
On a system with N GPUs, nvidia-smi numbers the devices from 0 to N-1; the environment variable CUDA_VISIBLE_DEVICES controls which of them a CUDA application can see.
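For example, to expose only devices 2 and 3 to a program (they then appear to CUDA as devices 0 and 1; the indices and program name are illustrative):
CUDA_VISIBLE_DEVICES=2,3 ./sum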