前言
记录cuda 常用基本语句
1、数组相加
#include <math.h>
#include <stdio.h>
const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z);
void check(const double *z, const int N);
int main(void)
{
const int N = 100000000;
const int M = sizeof(double) * N;
double *h_x = (double*) malloc(M);
double *h_y = (double*) malloc(M);
double *h_z = (double*) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = a;
h_y[n] = b;
}
double *d_x, *d_y, *d_z;
cudaMalloc((void **)&d_x, M);
cudaMalloc((void **)&d_y, M);
cudaMalloc((void **)&d_z, M);
cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice);
const int block_size = 128;
const int grid_size = N / block_size;
add<<<grid_size, block_size>>>(d_x, d_y, d_z);
cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost);
check(h_z, N);
free(h_x);
free(h_y);
free(h_z);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_z);
return 0;
}
void __global__ add(const double *x, const double *y, double *z)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
z[n] = x[n] + y[n];
}
void check(const double *z, const int N)
{
bool has_error = false;
for (int n = 0; n < N; ++n)
{
if (fabs(z[n] - c) > EPSILON)
{
has_error = true;
}
}
printf("%s\n", has_error ? "Has errors" : "No errors");
}
包含:
- cudaMalloc
- cudaMemcpy
- cudaMemcpyHostToDevice
- add<<<grid_size, block_size>>>(d_x, d_y, d_z)
- cudaFree
2、error.cuh
#pragma once
#include <stdio.h>
#define CHECK(call) \
do \
{ \
const cudaError_t error_code = call; \
if (error_code != cudaSuccess) \
{ \
printf("CUDA Error:\n"); \
printf(" File: %s\n", __FILE__); \
printf(" Line: %d\n", __LINE__); \
printf(" Error code: %d\n", error_code); \
printf(" Error text: %s\n", \
cudaGetErrorString(error_code)); \
exit(1); \
} \
} while (0)
3、error check api and kernel
#include "error.cuh"
#include <math.h>
#include <stdio.h>
const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);
int main(void)
{
const int N = 100000000;
const int M = sizeof(double) * N;
double *h_x = (double*) malloc(M);
double *h_y = (double*) malloc(M);
double *h_z = (double*) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = a;
h_y[n] = b;
}
double *d_x, *d_y, *d_z;
CHECK(cudaMalloc((void **)&d_x, M));
CHECK(cudaMalloc((void **)&d_y, M));
CHECK(cudaMalloc((void **)&d_z, M));
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));
const int block_size = 1280;
const int grid_size = (N + block_size - 1) / block_size;
add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());
CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
check(h_z, N);
free(h_x);
free(h_y);
free(h_z);
CHECK(cudaFree(d_x));
CHECK(cudaFree(d_y));
CHECK(cudaFree(d_z));
return 0;
}
void __global__ add(const double *x, const double *y, double *z, const int N)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
if (n < N)
{
z[n] = x[n] + y[n];
}
}
void check(const double *z, const int N)
{
bool has_error = false;
for (int n = 0; n < N; ++n)
{
if (fabs(z[n] - c) > EPSILON)
{
has_error = true;
}
}
printf("%s\n", has_error ? "Has errors" : "No errors");
}
4、 time 计时函数
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start); // cannot use the macro function CHECK here
// The code block to be timed
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
在代码中一般这样使用
void timing(const real *d_A, real *d_B, const int N, const int task)
{
const int grid_size_x = (N + TILE_DIM - 1) / TILE_DIM;
const int grid_size_y = grid_size_x;
const dim3 block_size(TILE_DIM, TILE_DIM);
const dim3 grid_size(grid_size_x, grid_size_y);
float t_sum = 0;
float t2_sum = 0;
for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
switch (task)
{
case 0:
copy<<<grid_size, block_size>>>(d_A, d_B, N);
break;
case 1:
transpose1<<<grid_size, block_size>>>(d_A, d_B, N);
break;
case 2:
transpose2<<<grid_size, block_size>>>(d_A, d_B, N);
break;
case 3:
transpose3<<<grid_size, block_size>>>(d_A, d_B, N);
break;
default:
printf("Error: wrong task\n");
exit(1);
break;
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
if (repeat > 0)
{
t_sum += elapsed_time;
t2_sum += elapsed_time * elapsed_time;
}
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
const float t_ave = t_sum / NUM_REPEATS;
const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
printf("Time = %g +- %g ms.\n", t_ave, t_err);
}
5、示例 time
#include "error.cuh"
#include <math.h>
#include <stdio.h>
#ifdef USE_DP
typedef double real;
const real EPSILON = 1.0e-15;
#else
typedef float real;
const real EPSILON = 1.0e-6f;
#endif
const int NUM_REPEATS = 10;
const real a = 1.23;
const real b = 2.34;
const real c = 3.57;
void __global__ add(const real *x, const real *y, real *z, const int N);
void check(const real *z, const int N);
int main(void)
{
const int N = 100000000;
const int M = sizeof(real) * N;
real *h_x = (real*) malloc(M);
real *h_y = (real*) malloc(M);
real *h_z = (real*) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = a;
h_y[n] = b;
}
real *d_x, *d_y, *d_z;
CHECK(cudaMalloc((void **)&d_x, M));
CHECK(cudaMalloc((void **)&d_y, M));
CHECK(cudaMalloc((void **)&d_z, M));
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));
const int block_size = 128;
const int grid_size = (N + block_size - 1) / block_size;
float t_sum = 0;
float t2_sum = 0;
for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
if (repeat > 0)
{
t_sum += elapsed_time;
t2_sum += elapsed_time * elapsed_time;
}
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
const float t_ave = t_sum / NUM_REPEATS;
const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
printf("Time = %g +- %g ms.\n", t_ave, t_err);
CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
check(h_z, N);
free(h_x);
free(h_y);
free(h_z);
CHECK(cudaFree(d_x));
CHECK(cudaFree(d_y));
CHECK(cudaFree(d_z));
return 0;
}
void __global__ add(const real *x, const real *y, real *z, const int N)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
if (n < N)
{
z[n] = x[n] + y[n];
}
}
void check(const real *z, const int N)
{
bool has_error = false;
for (int n = 0; n < N; ++n)
{
if (fabs(z[n] - c) > EPSILON)
{
has_error = true;
}
}
printf("%s\n", has_error ? "Has errors" : "No errors");
}
6、query
基本的设备信息
#include "error.cuh"
#include <stdio.h>
int main(int argc, char *argv[])
{
int device_id = 0;
if (argc > 1) device_id = atoi(argv[1]);
CHECK(cudaSetDevice(device_id));
cudaDeviceProp prop;
CHECK(cudaGetDeviceProperties(&prop, device_id));
printf("Device id: %d\n",
device_id);
printf("Device name: %s\n",
prop.name);
printf("Compute capability: %d.%d\n",
prop.major, prop.minor);
printf("Amount of global memory: %g GB\n",
prop.totalGlobalMem / (1024.0 * 1024 * 1024));
printf("Amount of constant memory: %g KB\n",
prop.totalConstMem / 1024.0);
printf("Maximum grid size: %d %d %d\n",
prop.maxGridSize[0],
prop.maxGridSize[1], prop.maxGridSize[2]);
printf("Maximum block size: %d %d %d\n",
prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2]);
printf("Number of SMs: %d\n",
prop.multiProcessorCount);
printf("Maximum amount of shared memory per block: %g KB\n",
prop.sharedMemPerBlock / 1024.0);
printf("Maximum amount of shared memory per SM: %g KB\n",
prop.sharedMemPerMultiprocessor / 1024.0);
printf("Maximum number of registers per block: %d K\n",
prop.regsPerBlock / 1024);
printf("Maximum number of registers per SM: %d K\n",
prop.regsPerMultiprocessor / 1024);
printf("Maximum number of threads per block: %d\n",
prop.maxThreadsPerBlock);
printf("Maximum number of threads per SM: %d\n",
prop.maxThreadsPerMultiProcessor);
return 0;
}