#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#define BLOCK_SIZE 2
#define BDIMX 2
#define BDIMY 4
// Fill `in` with the ramp 0, 1, 2, ... so transposed output is easy to eyeball.
void initialData(float *in, const int size)
{
    for (int idx = 0; idx < size; idx++)
        in[idx] = (float)idx;
}
// Print `size` elements of `in` on a single line, preceded by a `name` header.
void printData(const char *name, float *in, const int size)
{
    printf("%s:\n", name);
    int i = 0;
    while (i < size)
    {
        printf("%3.0lf", in[i]);
        ++i;
    }
    printf("\n\n");
}
// Pretty-print a row x col matrix stored row-major in `array`.
void printMatrix(float *array, int row, int col)
{
    for (int y = 0; y < row; y++)
    {
        // Start of the y-th row in the flat buffer.
        float *rowPtr = array + y * col;
        for (int x = 0; x < col; x++)
            printf("%3.0lf", rowPtr[x]);
        printf("\n");
    }
    printf("\n");
}
// CPU reference transpose: `in` is row x col (row-major); `out` becomes
// col x row, with out(c, r) = in(r, c).
void transposeHost(float *out, float *in, const int row, const int col)
{
    for (int r = 0; r < row; ++r)
    {
        for (int c = 0; c < col; ++c)
        {
            out[c * row + r] = in[r * col + c];
        }
    }
}
// Compare the CPU reference against the GPU result element-wise.
// Returns 0 when all elements match (within epsilon), 1 on the first
// mismatch, printing a diagnostic either way.
// Bug fixed: the original truncated both values to int before comparing,
// so fractional differences (e.g. 0.2 vs 0.9) were silently accepted.
int checkResult(float *hostRef, float *gpuRef, const int size)
{
    const double epsilon = 1.0E-8;
    for (int i = 0; i < size; i++)
    {
        if (fabs((double)hostRef[i] - (double)gpuRef[i]) > epsilon)
        {
            printf("\n%d Error in computation and values are %f and %f\n\n", i, hostRef[i], gpuRef[i]);
            return 1;
        }
    }
    printf("The result is same!\n\n\n");
    return 0;
}
// Naive transpose: each thread reads in(iy, ix) — coalesced along x — and
// writes out(ix, iy) — strided. `in` is row x col, `out` is col x row.
// Expects a 2D launch whose x extent covers `col` and y extent covers `row`.
__global__ void transposeNaiveRow(float *out, float *in, const int row, const int col)
{
    const unsigned int inCol = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    if (inRow < row && inCol < col)
        out[inCol * row + inRow] = in[inRow * col + inCol];
}
// Naive transpose, column-walk variant. `in` is row x col, `out` is col x row.
// Expects a 2D launch whose x extent covers `col` and y extent covers `row`
// (the same grid used for transposeNaiveRow in main()).
// Bug fixed: the original mapping out[iy*col+ix] = in[ix*row+iy] is only a
// valid transpose for SQUARE matrices; with the given thread layout
// (ix < col, iy < row) the unique correct mapping is out(ix, iy) = in(iy, ix),
// written below. For rectangular inputs the old code produced a scrambled
// result that checkResult flagged as an error.
__global__ void transposeNaiveCol(float *out, float *in, const int row, const int col)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x; // input column
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y; // input row
    if (ix < col && iy < row)
    {
        // out has `row` columns: element (ix, iy) of out = element (iy, ix) of in.
        out[ix * row + iy] = in[iy * col + ix];
    }
}
// Naive transpose where the thread x index walks the OUTPUT columns instead
// of the input columns. `in` is row x col, `out` is col x row. Expects a 2D
// launch whose x extent covers `row` and y extent covers `col` (grid2 in main).
__global__ void transposeNaiveCol1(float *out, float *in, const int row, const int col)
{
    // outCol is a column of `out` (== a row of `in`); outRow the converse.
    const unsigned int outCol = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int outRow = blockDim.y * blockIdx.y + threadIdx.y;
    if (outCol < row && outRow < col)
        out[outRow * row + outCol] = in[outCol * col + outRow];
}
// Shared-memory transpose: each block stages a tile of `in` in shared memory,
// then stores it transposed so that global writes are coalesced.
// `in` is nrow x ncol, `out` is ncol x nrow. Expects a 2D launch with x extent
// covering ncol and y extent covering nrow.
// NOTE(review): the static tile requires blockDim.y <= BDIMX and
// blockDim.x <= BDIMY, otherwise the tile indexing below overruns — confirm
// launch config against the macros.
__global__ void transposeSmem(float *out, float *in, int nrow, int ncol)
{
    __shared__ float tile[BDIMX][BDIMY];
    // Global coordinates of the element this thread LOADS from `in`.
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int offset = row * ncol + col;
    // Linear thread id, re-split so consecutive threads store consecutive
    // addresses of `out` (coalesced writes).
    int bidx = threadIdx.y * blockDim.x + threadIdx.x;
    int irow = bidx / blockDim.y;
    int icol = bidx % blockDim.y;
    // Global coordinates of the element this thread STORES to `out`
    // (out is ncol x nrow, row-major).
    int gcol = blockIdx.y * blockDim.y + icol;
    int grow = blockIdx.x * blockDim.x + irow;
    int transposed_offset = grow * nrow + gcol;
    // Bug fixes vs the original:
    //  1) __syncthreads() was inside the divergent bounds check — undefined
    //     behavior whenever a block straddles the matrix edge.
    //  2) the store was guarded by the LOAD bounds, allowing out-of-bounds
    //     writes (and skipping valid stores) on ragged edge blocks.
    if (row < nrow && col < ncol)
        tile[threadIdx.y][threadIdx.x] = in[offset];
    __syncthreads();
    // The tile element read here was written by the thread whose load
    // coordinates were (gcol, grow), so this guard also makes the read valid.
    if (grow < ncol && gcol < nrow)
        out[transposed_offset] = tile[icol][irow];
}
// Shared-memory transpose with 2x unrolling along x: each block stages two
// column-tiles of `in` (hence the doubled grid stride in x) and stores both
// transposed with coalesced writes. `in` is nrows x ncols, `out` is
// ncols x nrows. Expects grid.x to cover ceil(ncols / (2*blockDim.x)).
// NOTE(review): the static tile requires blockDim.y <= BDIMX and
// blockDim.x <= BDIMY — confirm launch config against the macros.
__global__ void transposeSmemUnroll(float *out, float *in, const int nrows, const int ncols)
{
    __shared__ float tile[BDIMX][BDIMY * 2];
    // Coordinates of the first of the two elements this thread loads.
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int ix = (blockIdx.x * blockDim.x * 2) + threadIdx.x;
    unsigned int index = ix + iy * ncols;
    // Bug fix: the original guarded BOTH loads with ix+blockDim.x < ncols,
    // dropping the valid first element whenever only the second tile fell
    // outside the matrix. Guard each load independently.
    if (iy < nrows && ix < ncols)
        tile[threadIdx.y][threadIdx.x] = in[index];
    if (iy < nrows && ix + blockDim.x < ncols)
        tile[threadIdx.y][threadIdx.x + blockDim.x] = in[index + blockDim.x];
    __syncthreads();
    // Re-split the linear thread id so stores to `out` are coalesced.
    int bidx = threadIdx.x + blockDim.x * threadIdx.y;
    unsigned int ty2 = bidx / blockDim.y;
    unsigned int tx2 = bidx % blockDim.y;
    // Output coordinates (out is ncols x nrows, row-major).
    int ix2 = blockIdx.y * blockDim.y + tx2;
    int iy2 = 2 * blockIdx.x * blockDim.x + ty2;
    int index2 = iy2 * nrows + ix2;
    // Bug fix: the original stores were completely unguarded, writing out of
    // bounds whenever the matrix is not a multiple of the (doubled) block.
    if (ix2 < nrows && iy2 < ncols)
        out[index2] = tile[tx2][ty2];
    if (ix2 < nrows && iy2 + blockDim.x < ncols)
        out[index2 + nrows * blockDim.x] = tile[tx2][ty2 + blockDim.x];
}
// Benchmark one transpose kernel: zero the host result buffer, launch,
// time the kernel alone, copy the result back, print it, and compare it
// against the CPU reference.
// Fixes vs the original inline stanzas: launch errors are checked right
// after the launch (not after the copy-back), and the timed region covers
// only the kernel (the original also timed the blocking D2H memcpy).
static void runTransposeKernel(const char *name,
                               void (*kern)(float *, float *, int, int),
                               dim3 grid, dim3 block, float *d_C, float *d_A,
                               int nx, int ny, float *gpuRef, float *hostRef,
                               size_t nBytes)
{
    memset(gpuRef, 0, nBytes);
    double iStart = seconds();
    kern<<<grid, block>>>(d_C, d_A, nx, ny);
    CHECK(cudaGetLastError());      // surfaces invalid launch configurations
    CHECK(cudaDeviceSynchronize()); // stop the clock when the kernel finishes
    double iElaps = seconds() - iStart;
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    // Effective bandwidth: one global read + one global write per element.
    float ibnd = 2 * nx * ny * sizeof(float) / 1e9 / iElaps;
    printf("%s elapsed %f sec <<< grid (%d,%d) block (%d,%d)>>> effective "
           "bandwidth %f GB\n", name, iElaps, grid.x, grid.y, block.x,
           block.y, ibnd);
    printData(name, gpuRef, nx * ny);
    printMatrix(gpuRef, ny, nx);
    checkResult(hostRef, gpuRef, nx * ny);
}

// Driver: builds an nx x ny ramp matrix, computes the CPU reference
// transpose, then runs and validates each GPU transpose kernel.
// Usage: prog [iKernel] [blockx] [blocky] [nx] [ny]
int main(int argc, char **argv)
{
    // Select device and report what we are running on.
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // Defaults: 4 x 2 matrix, 2 x 2 thread blocks.
    int nx = 4;
    int ny = 2;
    int iKernel = 0;  // parsed for CLI compatibility; not used for dispatch here
    int blockx = 2;
    int blocky = 2;
    if (argc > 1) iKernel = atoi(argv[1]);
    if (argc > 2) blockx = atoi(argv[2]);
    if (argc > 3) blocky = atoi(argv[3]);
    if (argc > 4) nx = atoi(argv[4]);
    if (argc > 5) ny = atoi(argv[5]);
    printf(" with matrix nx %d ny %d with kernel %d\n", nx, ny, iKernel);

    // Host buffers: input, CPU reference transpose, GPU result.
    size_t nBytes = nx * ny * sizeof(float);
    float *h_A = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    initialData(h_A, nx * ny);
    printData("h_A", h_A, nx * ny);
    printMatrix(h_A, nx, ny);
    transposeHost(hostRef, h_A, nx, ny);
    printData("hostRef", hostRef, nx * ny);
    printMatrix(hostRef, ny, nx);

    // Device buffers.
    float *d_A, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // `grid`: x covers the ny input columns, y covers the nx input rows.
    // `grid2`: the swapped layout used by transposeNaiveCol1.
    dim3 block(blockx, blocky);
    dim3 grid((ny + block.x - 1) / block.x, (nx + block.y - 1) / block.y);
    dim3 grid2((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    runTransposeKernel("transposeNaiveRow", transposeNaiveRow, grid, block,
                       d_C, d_A, nx, ny, gpuRef, hostRef, nBytes);
    runTransposeKernel("transposeNaiveCol", transposeNaiveCol, grid, block,
                       d_C, d_A, nx, ny, gpuRef, hostRef, nBytes);
    runTransposeKernel("transposeNaiveCol1", transposeNaiveCol1, grid2, block,
                       d_C, d_A, nx, ny, gpuRef, hostRef, nBytes);
    runTransposeKernel("transposeSmem", transposeSmem, grid, block,
                       d_C, d_A, nx, ny, gpuRef, hostRef, nBytes);

    // Bug fix: the original halved grid.x with truncating division
    // (((ny + bx - 1) / bx) / 2), which yields 0 — an invalid launch
    // configuration — whenever ny <= block.x (including the defaults
    // ny = 2, blockx = 2). Use a ceiling division of the column blocks.
    dim3 gridUnroll(((ny + block.x - 1) / block.x + 1) / 2,
                    (nx + block.y - 1) / block.y);
    runTransposeKernel("transposeSmemUnroll", transposeSmemUnroll, gridUnroll,
                       block, d_C, d_A, nx, ny, gpuRef, hostRef, nBytes);

    // Release device and host resources.
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}