#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
outdata = indata    — pointer copy: makes outdata point to the same address as
                      indata; both then refer to the same memory region.
*outdata = *indata  — pointed-to assignment: copies the value stored at the
                      address indata points to into the address outdata points
                      to; the two pointers themselves need not be equal.
*/
// usage: ./execfilename ikernel dimx dimy arraysize [-v (verify)] [-p (print)]
// e.g.:  ./matrix 1 32 1 1024 -v -p
// Fill `array` with `size` pseudo-random integer-valued floats in [1, 10].
void initialArray(float *array, int size)
{
    for (int idx = 0; idx < size; ++idx)
        array[idx] = (float)(1 + rand() % 10);
}
// Print `size` elements of `array` to stdout, preceded by a header line
// naming the array and its size.
void printArray(const char *name, float *array, int size)
{
    printf("%s:arraySize:%d\n", name, size);
    int i = 0;
    while (i < size)
    {
        printf("%4.0f", array[i]);
        ++i;
    }
    printf("\n");
}
// Sum `size` per-block partial results from `array` and return the total.
// Fixed: the original declared an `int` return type while accumulating in a
// float, silently truncating any fractional part on return; the caller stores
// the result in a float, so return the float accumulator directly.
float addblockSum(float *array, int size)
{
    float sum = 0.0f;
    for (int i = 0; i < size; i++)
    {
        sum += array[i];
    }
    return sum;
}
// Compare the CPU and GPU reduction results for exact equality.
// Prints a status line either way; returns 0 on match, 1 on mismatch.
int verifyArray(float sum1, float sum2)
{
    if (sum1 == sum2)
    {
        printf("\nThe result is same!\n");
        return 0;
    }
    printf("\n The result %lf and %lf is not same!\n", sum1, sum2);
    return 1;
}
/*
int recursiveReduce(float *indata,int const size)
{
if (size == 1)
{
return indata[0];
}
int const stride = size / 2;
for (int i = 0; i < stride; i++)
{
indata[i] += indata[i + stride];
}
return recursiveReduce(indata,stride);
}
*/
// CPU reference reduction: sum `size` floats sequentially.
// (The name is kept from the commented-out recursive version above; this is
// the iterative replacement.)
// Fixed: the original returned `int`, truncating the float total; it now
// returns float to match the accumulator and the caller's float variable.
float recursiveReduce(float *indata, int const size)
{
    float sum = 0.0f;
    for (int i = 0; i < size; i++)
    {
        sum += indata[i];
    }
    return sum;
}
// Kernel 1: neighbored-pair reduction. Each block reduces blockDim.x input
// elements in place in global memory; thread 0 writes the block's partial
// sum to g_odata[blockIdx.x]. Host sums the partials.
// Fixed: the original did `if (idx >= size) return;` BEFORE the loop, so in
// a partial tail block some threads exited while others waited at
// __syncthreads() — undefined behavior. It also read idata[tid + stride]
// past the end of the array in that tail block. Both are handled by keeping
// every thread in the loop and bounds-checking the partner element instead.
__global__ void reduceNeighbored(float *g_idata, float *g_odata, unsigned int size)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Base of this block's segment of the input.
    float *idata = g_idata + blockIdx.x * blockDim.x;
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
        // Only stride-aligned threads whose partner is in range accumulate.
        if ((tid % (2 * stride)) == 0 && idx + stride < size)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads(); // reached by ALL threads of the block — no early return
    }
    if (tid == 0 && idx < size) g_odata[blockIdx.x] = idata[0];
}
// Kernel 2: neighbored-pair reduction with improved indexing — each block
// covers 2*blockDim.x elements, so half as many threads/blocks are needed.
__global__ void reduceNeighboredLess(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    // Global index of this thread's first element (block spans 2*blockDim.x).
    unsigned int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    // Base of this block's 2*blockDim.x-wide segment of the input.
    float *idata = g_idata + blockIdx.x * blockDim.x * 2;
    // NOTE(review): this early return sits before the __syncthreads() below;
    // if n is not a multiple of 2*blockDim.x, some threads exit while others
    // wait at the barrier — undefined behavior. Appears to assume n divides
    // evenly; confirm at the call site.
    if (idx >= n) return;
    for (int stride = 1; stride < blockDim.x * 2; stride *= 2)
    {
        // Remap consecutive tids onto the stride-aligned slots so the active
        // threads stay packed into the low-numbered warps (less divergence).
        int index = 2 * stride * tid;
        if (index < blockDim.x * 2)
        {
            idata[index] = idata[index] + idata[index + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 3: interleaved-pair reduction — each block covers 2*blockDim.x
// elements, halving the thread count versus kernel 1.
__global__ void reduceInterleaved(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    // Global index of this thread's first element (block spans 2*blockDim.x).
    unsigned int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    // Base of this block's 2*blockDim.x-wide segment of the input.
    float *idata = g_idata + blockIdx.x * blockDim.x * 2;
    // NOTE(review): early return before __syncthreads() — undefined behavior
    // for a partial tail block; seems to assume n is a multiple of
    // 2*blockDim.x. Confirm at the call site.
    if (idx >= n) return;
    // Halve the stride each pass: thread tid adds the element stride away,
    // so active threads are contiguous (no warp-internal divergence until
    // stride drops below the warp size).
    for (int stride = blockDim.x * 2 / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 4: unrolling-2 — each block first folds a second data block into
// its own, so the grid is half the size of kernel 1's.
__global__ void reduceUnrolling2(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    // Global index into the first of the two data blocks this block handles.
    unsigned int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    float *idata = g_idata + blockIdx.x * blockDim.x * 2;
    // Fold the second data block into the first (one add per thread).
    if (idx + blockDim.x < n) g_idata[idx] += g_idata[idx + blockDim.x];
    __syncthreads();
    // Interleaved in-place reduction over the first blockDim.x elements.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 5: unrolling-4 — each block reduces 4 data blocks, so the grid
// shrinks to a quarter. Each thread first folds its 4 strided elements into
// one, then the block runs an interleaved in-place reduction.
// Precondition (review): a tail block with idx + 3*blockDim.x >= n skips the
// fold entirely and then reduces unfolded data — n should be a multiple of
// 4*blockDim.x; confirm at the call site.
__global__ void reduceUnrolling4(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;
    float *idata = g_idata + blockIdx.x * blockDim.x * 4;
    if (idx + 3 * blockDim.x < n)
    {
        // Fixed: these temporaries were declared `int`, truncating the float
        // inputs before summing; float preserves the values.
        float a1 = g_idata[idx];
        float a2 = g_idata[idx + blockDim.x];
        float a3 = g_idata[idx + 2 * blockDim.x];
        float a4 = g_idata[idx + 3 * blockDim.x];
        g_idata[idx] = a1 + a2 + a3 + a4;
    }
    __syncthreads();
    // Interleaved in-place reduction over the first blockDim.x elements.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 6: unrolling-8 — each block reduces 8 data blocks, so the grid
// shrinks to an eighth. Each thread folds its 8 strided elements into one,
// then the block runs an interleaved in-place reduction.
// Precondition (review): assumes n is a multiple of 8*blockDim.x; a tail
// block skips the fold and reduces unfolded data. Confirm at the call site.
__global__ void reduceUnrolling8(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 8 + threadIdx.x;
    float *idata = g_idata + blockIdx.x * blockDim.x * 8;
    if (idx + 7 * blockDim.x < n)
    {
        // Fixed: these temporaries were declared `int`, truncating the float
        // inputs before summing; float preserves the values.
        float a1 = g_idata[idx];
        float a2 = g_idata[idx + blockDim.x];
        float a3 = g_idata[idx + 2 * blockDim.x];
        float a4 = g_idata[idx + 3 * blockDim.x];
        float b1 = g_idata[idx + 4 * blockDim.x];
        float b2 = g_idata[idx + 5 * blockDim.x];
        float b3 = g_idata[idx + 6 * blockDim.x];
        float b4 = g_idata[idx + 7 * blockDim.x];
        g_idata[idx] = a1 + a2 + a3 + a4 + b1 + b2 + b3 + b4;
    }
    __syncthreads();
    // Interleaved in-place reduction over the first blockDim.x elements.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 7: unrolling-8 plus an unrolled last warp — same fold as kernel 6,
// but the final 6 reduction steps (stride 32..1) run without barriers using
// a volatile pointer.
// Preconditions (review): blockDim.x must be >= 64 (the stride-32 step reads
// idata[tid + 32]); the volatile trick relies on implicit warp-synchronous
// execution, which is only safe pre-Volta — on Volta+ independent thread
// scheduling this needs __syncwarp(). Assumes n is a multiple of
// 8*blockDim.x. Confirm at the call site.
__global__ void reduceUnrollWarps8(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 8 + threadIdx.x;
    float *idata = g_idata + blockIdx.x * blockDim.x * 8;
    if (idx + 7 * blockDim.x < n)
    {
        // Fixed: these temporaries were declared `int`, truncating the float
        // inputs before summing; float preserves the values.
        float a1 = g_idata[idx];
        float a2 = g_idata[idx + blockDim.x];
        float a3 = g_idata[idx + 2 * blockDim.x];
        float a4 = g_idata[idx + 3 * blockDim.x];
        float b1 = g_idata[idx + 4 * blockDim.x];
        float b2 = g_idata[idx + 5 * blockDim.x];
        float b3 = g_idata[idx + 6 * blockDim.x];
        float b4 = g_idata[idx + 7 * blockDim.x];
        g_idata[idx] = a1 + a2 + a3 + a4 + b1 + b2 + b3 + b4;
    }
    __syncthreads();
    // Barriered reduction until only 64 elements remain (stride stops at 64).
    for (int stride = blockDim.x / 2; stride > 32; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Last warp: unrolled, barrier-free; volatile forces each store to be
    // visible to the subsequent load.
    if (tid < 32)
    {
        volatile float *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 8: unrolling-8 with the block-level loop fully unrolled — the
// stride loop of kernel 7 is replaced by explicit steps guarded on
// blockDim.x, plus the barrier-free last-warp unroll.
// Preconditions (review): blockDim.x must be a power of two in [64, 1024];
// the volatile last-warp trick assumes pre-Volta warp-synchronous execution
// (Volta+ needs __syncwarp()). Assumes n is a multiple of 8*blockDim.x.
__global__ void reduceCompleteUnrollWarps8(float *g_idata, float *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 8 + threadIdx.x;
    float *idata = g_idata + blockIdx.x * blockDim.x * 8;
    if (idx + 7 * blockDim.x < n)
    {
        // Fixed: these temporaries were declared `int`, truncating the float
        // inputs before summing; float preserves the values.
        float a1 = g_idata[idx];
        float a2 = g_idata[idx + blockDim.x];
        float a3 = g_idata[idx + 2 * blockDim.x];
        float a4 = g_idata[idx + 3 * blockDim.x];
        float b1 = g_idata[idx + 4 * blockDim.x];
        float b2 = g_idata[idx + 5 * blockDim.x];
        float b3 = g_idata[idx + 6 * blockDim.x];
        float b4 = g_idata[idx + 7 * blockDim.x];
        g_idata[idx] = a1 + a2 + a3 + a4 + b1 + b2 + b3 + b4;
    }
    __syncthreads();
    // Fully unrolled block reduction; each step is a no-op for smaller blocks.
    if (blockDim.x >= 1024 && tid < 512) idata[tid] += idata[tid + 512];
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256) idata[tid] += idata[tid + 256];
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128) idata[tid] += idata[tid + 128];
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64) idata[tid] += idata[tid + 64];
    __syncthreads();
    // Last warp: unrolled, barrier-free, via volatile accesses.
    if (tid < 32)
    {
        volatile float *vsmem = idata;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid + 8];
        vsmem[tid] += vsmem[tid + 4];
        vsmem[tid] += vsmem[tid + 2];
        vsmem[tid] += vsmem[tid + 1];
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
// Kernel 9: unrolling-4 reduction staged through shared memory — each thread
// folds its 4 strided global elements into a register, stores the partial
// into smem, and the block reduces smem with the fully unrolled scheme.
// Preconditions (review): blockDim.x must be a power of two in [64, 1024]
// (static smem holds 1024 floats; stride-32 step reads smem[tid + 32]); the
// volatile last-warp trick assumes pre-Volta warp-synchronous execution.
__global__ void reduceSmemUnrolling4(float *g_idata, float *g_odata, unsigned int n)
{
    __shared__ float smem[1024];
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;
    // Per-thread partial; stays 0 for threads past the end of the input, so
    // a tail block is handled correctly (out-of-range slots contribute 0).
    float tmpSum = 0.0;
    if (idx + 3 * blockDim.x < n)
    {
        // Fixed: these temporaries were declared `int`, truncating the float
        // inputs before summing; float preserves the values.
        float a1 = g_idata[idx];
        float a2 = g_idata[idx + blockDim.x];
        float a3 = g_idata[idx + 2 * blockDim.x];
        float a4 = g_idata[idx + 3 * blockDim.x];
        tmpSum = a1 + a2 + a3 + a4;
    }
    smem[tid] = tmpSum;
    __syncthreads();
    // Fully unrolled block reduction in shared memory.
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64) smem[tid] += smem[tid + 64];
    __syncthreads();
    // Last warp: unrolled, barrier-free, via volatile accesses.
    if (tid < 32)
    {
        volatile float *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid + 8];
        vsmem[tid] += vsmem[tid + 4];
        vsmem[tid] += vsmem[tid + 2];
        vsmem[tid] += vsmem[tid + 1];
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}
// Driver: parse args, build launch configs, run the CPU reference and the
// selected GPU reduction kernel, then compare and report.
// Fixes: (1) kernelName was read uninitialized when ikernel was not 1-9;
// (2) kernel launches are asynchronous, so the elapsed time was read BEFORE
// cudaDeviceSynchronize() and measured only launch overhead — the sync now
// precedes the timer read; (3) kernel 1's label was a copy-paste mistake
// ("multiplicateMatrixOnDevice").
int main(int argc, char **argv)
{
    // Select and report device 0.
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    // Defaults; overridden by: ./prog ikernel dimx dimy arraySize [-v] [-p]
    int arraySize = 1024;
    int dimx = 1024;
    int dimy = 1;
    int ikernel = 0;
    bool ifverify = 0;
    bool ifprintf = 0;
    if (argc > 1) ikernel = atoi(argv[1]);
    if (argc > 2) dimx = atoi(argv[2]);
    if (argc > 3) dimy = atoi(argv[3]);
    if (argc > 4) arraySize = atoi(argv[4]);
    if ((argc > 5 && strcmp(argv[5], "-v") == 0)) ifverify = 1;
    if ((argc > 6 && strcmp(argv[6], "-p") == 0) || (argc > 5 && strcmp(argv[5], "-p") == 0)) ifprintf = 1;
    // Launch configurations: block2 halves the threads (kernels 2/3);
    // grid4/5/6 shrink the grid for the 2x/4x/8x unrolled kernels.
    dim3 block(dimx, dimy);
    dim3 block2(dimx / 2, dimy);
    dim3 grid((arraySize + block.x - 1) / block.x, 1);
    dim3 grid4(grid.x / 2, 1);
    dim3 grid5(grid.x / 4, 1);
    dim3 grid6(grid.x / 8, 1);
    // Host buffers: input data and the per-block partial sums copied back.
    float *h_idata, *d_result;
    h_idata = (float*)malloc(arraySize * sizeof(float));
    d_result = (float*)malloc(grid.x * sizeof(float));
    initialArray(h_idata, arraySize);
    if (ifprintf) printArray("h_idata", h_idata, arraySize);
    // Device buffers.
    float *d_idata, *d_odata;
    cudaMalloc((void**)&d_idata, arraySize * sizeof(float));
    cudaMalloc((void**)&d_odata, grid.x * sizeof(float));
    cudaMemcpy(d_idata, h_idata, arraySize * sizeof(float), cudaMemcpyHostToDevice);
    // CPU reference reduction, timed.
    double iStart, iElaps;
    iStart = seconds();
    float cpu_sum = recursiveReduce(h_idata, arraySize);
    iElaps = seconds() - iStart;
    printf("cpu_sum:\n%0.0f\n", cpu_sum);
    printf("CPU运行时间为:%lfs\n\n", iElaps);
    // Dispatch the selected kernel.
    char const *kernelName = "unknown"; // fixed: was uninitialized for bad ikernel
    iStart = seconds();
    switch (ikernel)
    {
    case 1:
        kernelName = "reduceNeighbored"; // fixed mislabel
        reduceNeighbored << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 2:
        kernelName = "reduceNeighboredLess";
        block = block2;
        reduceNeighboredLess << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 3:
        kernelName = "reduceInterleaved";
        block = block2;
        reduceInterleaved << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 4:
        kernelName = "reduceUnrolling2 ";
        grid = grid4;
        reduceUnrolling2 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 5:
        kernelName = "reduceUnrolling4 ";
        grid = grid5;
        reduceUnrolling4 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 6:
        kernelName = "reduceUnrolling8 ";
        grid = grid6;
        reduceUnrolling8 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 7:
        kernelName = "reduceUnrollWarps8 ";
        grid = grid6;
        reduceUnrollWarps8 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 8:
        kernelName = "reduceCompleteUnrollWarps8 ";
        grid = grid6;
        reduceCompleteUnrollWarps8 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    case 9:
        kernelName = "reduceSmemUnrolling4 ";
        grid = grid5;
        reduceSmemUnrolling4 << <grid, block >> > (d_idata, d_odata, arraySize);
        break;
    default:
        printf("Unknown kernel index %d (expected 1-9)\n", ikernel);
        break;
    }
    // Kernel launches are asynchronous: wait for completion BEFORE reading
    // the timer (original measured only the launch call).
    cudaDeviceSynchronize();
    iElaps = seconds() - iStart;
    // Copy back the per-block partial sums and finish the reduction on host.
    cudaMemcpy(d_result, d_odata, grid.x * sizeof(float), cudaMemcpyDeviceToHost);
    float gpu_sum = addblockSum(d_result, grid.x);
    printf("%s functions <<<grid(%d,%d),block(%d,%d)>>> GPU运行时间为:%fs\n", kernelName, grid.x, grid.y, block.x, block.y, iElaps);
    printf("gpu_sum:\n%0.0f\n", gpu_sum);
    if (ifverify) verifyArray(cpu_sum, gpu_sum);
    cudaFree(d_idata);
    cudaFree(d_odata);
    free(h_idata);
    free(d_result);
    return 0;
}
// CUDA technology test — reduction algorithms
// (scraped page footer; original said: latest recommended article published 2025-03-10 16:40:30)