P4 向量加法程序解析
#include <stdio.h>
#include <cuda.h>
#include "uax.h"
typedef float FLOAT;
void vec_add_host(FLOAT *x, FLOAT *y, FLOAT *z, int N);
/*
 * Device kernel: accumulates x and y into z element-wise, z[k] = z[k] + y[k] + x[k].
 *
 * Each thread handles the single element whose index is produced by get_tid();
 * threads whose index falls at or beyond N do nothing, so the launch may
 * contain more threads than there are elements.
 *
 * x, y : input vectors (read only by this kernel)
 * z    : vector that is both read and overwritten; repeated launches keep
 *        accumulating into it
 * N    : number of elements in each vector
 */
__global__ void vec_add(FLOAT *x, FLOAT *y, FLOAT *z, int N)
{
    const int tid = get_tid();

    /* surplus threads past the end of the vectors exit without touching memory */
    if (tid >= N) {
        return;
    }

    /* keep the left-to-right order (z + y) + x so rounding matches the host version */
    z[tid] = z[tid] + y[tid] + x[tid];
}
/*
 * Host (CPU) reference: accumulates x and y into z element-wise,
 * z[k] = z[k] + y[k] + x[k] for k in [0, N).
 *
 * x, y : input vectors of at least N elements
 * z    : vector of at least N elements, read and overwritten in place
 * N    : number of elements to process; nothing happens when N <= 0
 */
void vec_add_host(FLOAT *x, FLOAT *y, FLOAT *z, int N)
{
    int k = 0;

    while (k < N) {
        /* same left-to-right evaluation order as the device kernel */
        z[k] = z[k] + y[k] + x[k];
        ++k;
    }
}
/*
 * Flat global thread index for a launch that uses 1D blocks on a 2D grid:
 * (linear block id) * (threads per block) + (thread offset inside the block).
 * The linear block id is row-major over the grid, so one grid row spans
 * gridDim.x blocks (not blockDim.x, which is the number of threads in a block).
 */
#define get_tid() (blockDim.x * (blockIdx.x + blockIdx.y * gridDim.x) + threadIdx.x)
/* Linear block index for a 2D grid, row-major: each grid row holds gridDim.x blocks. */
#define get_bid() (blockIdx.x + blockIdx.y * gridDim.x)
#include<sys/time.h>
#include<time.h>
/*
 * Returns the current wall-clock time in seconds as a double, combining the
 * whole seconds and the microsecond part reported by gettimeofday().
 * Differences between two calls give elapsed time in seconds.
 */
double get_time(void){
    struct timeval now;
    double seconds;
    double micros;

    /* no timezone information is requested */
    gettimeofday(&now, NULL);

    seconds = (double)now.tv_sec;
    micros = (double)now.tv_usec;
    return seconds + micros * 1e-6;
}
/*
 * Trivial kernel launched by warmup(). It only adds two thread-local integers
 * and never reads or writes memory visible outside the thread, so it has no
 * observable result.
 */
__global__ void warmpup_knl() {
    int lhs = 1;
    int rhs = 2;

    lhs += rhs;
}
/*
 * Launches the trivial warm-up kernel 8 times (1 block of 256 threads each).
 * The launches are asynchronous; this function does not wait for them, so the
 * caller must synchronize before starting a timer.
 * A failed launch is reported on stderr instead of being silently ignored.
 */
void warmup() {
    int round;
    cudaError_t status;

    for (round = 0; round < 8; round++) {
        warmpup_knl<<<1, 256>>>();
    }

    /* kernel launches return nothing; a bad launch only shows up via the last-error state */
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "warmup kernel launch failed: %s\n", cudaGetErrorString(status));
    }
}
/* Prints a diagnostic for a failed CUDA runtime call; returns 1 when err is cudaSuccess, else 0. */
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return 1;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Benchmark driver: allocates three vectors of N floats on the GPU and on the
 * host, fills them with ones, runs the accumulate-add `itr` times on the GPU
 * and on the CPU, prints both timings and the speedup, and finally reports the
 * largest difference between the GPU and CPU results.
 *
 * Returns 0 on success, -1 if GPU memory cannot be allocated, -2 if host
 * memory cannot be allocated, -3 if any other CUDA call or kernel launch
 * fails. All buffers are released on every exit path.
 */
int main()
{
    int N = 20000000;
    /* size_t so the byte count cannot overflow int if N is raised */
    size_t nbytes = (size_t)N * sizeof(FLOAT);
    int bs = 256;
    /* side of a square 2D grid holding at least ceil(N / bs) blocks in total */
    int s = ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);
    FLOAT *dx = NULL, *hx = NULL;
    FLOAT *dy = NULL, *hy = NULL;
    FLOAT *dz = NULL, *hz = NULL;
    FLOAT max_diff = 0;
    FLOAT diff;
    int itr = 30;
    int i;
    int rc = 0;
    double th, td;
    cudaError_t sync_status;

    /* device buffers */
    if (!cuda_ok(cudaMalloc((void **)&dx, nbytes), "cudaMalloc(dx)") ||
        !cuda_ok(cudaMalloc((void **)&dy, nbytes), "cudaMalloc(dy)") ||
        !cuda_ok(cudaMalloc((void **)&dz, nbytes), "cudaMalloc(dz)") ||
        dx == NULL || dy == NULL || dz == NULL) {
        printf("couldn't allocate GPU memory\n");
        rc = -1;
        goto cleanup;
    }
    printf("allocated %.2f MB on GPU\n", nbytes / (1024.f * 1024.f));

    /* host buffers */
    hx = (FLOAT *) malloc(nbytes);
    hy = (FLOAT *) malloc(nbytes);
    hz = (FLOAT *) malloc(nbytes);
    if (hx == NULL || hy == NULL || hz == NULL) {
        printf("couldn't allocate CPU memory\n");
        rc = -2;
        goto cleanup;
    }
    printf("allocated %.2f MB on CPU\n", nbytes / (1024.f * 1024.f));

    for (i = 0; i < N; i++) {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    /* upload the initial values so GPU and CPU start from identical data */
    if (!cuda_ok(cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dx)") ||
        !cuda_ok(cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dy)") ||
        !cuda_ok(cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dz)")) {
        rc = -3;
        goto cleanup;
    }

    /* finish the warm-up launches before the timer starts */
    warmup();
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (warmup)")) {
        rc = -3;
        goto cleanup;
    }

    /* GPU timing: launches are asynchronous, so synchronize before reading the clock */
    td = get_time();
    for (i = 0; i < itr; i++) vec_add<<<grid, bs>>>(dx, dy, dz, N);
    sync_status = cudaDeviceSynchronize();
    td = get_time() - td;

    /* errors are inspected after the timer stops so the checks do not skew the measurement */
    if (!cuda_ok(cudaGetLastError(), "vec_add launch") ||
        !cuda_ok(sync_status, "cudaDeviceSynchronize (vec_add)")) {
        rc = -3;
        goto cleanup;
    }

    /* CPU timing */
    th = get_time();
    for (i = 0; i < itr; i++) vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %e, CPU time: %e, speedup: %g\n", td, th, th / td);

    /* hx is no longer needed as an input, so reuse it to stage the GPU result for comparison */
    if (!cuda_ok(cudaMemcpy(hx, dz, nbytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)")) {
        rc = -3;
        goto cleanup;
    }
    for (i = 0; i < N; i++) {
        diff = hx[i] - hz[i];
        if (diff < 0) diff = -diff;
        if (diff > max_diff) max_diff = diff;
    }
    printf("max |GPU - CPU| difference: %e\n", max_diff);

cleanup:
    /* cudaFree and free both accept NULL, so this is safe after a partial allocation */
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(hx);
    free(hy);
    free(hz);
    return rc;
}
nvcc -O3 -arch=sm_52 -Xcompiler "-Wall -Wextra" -o my-vec-add vec-add.cu
./my-vec-add
