my code:
// MP Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}
#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)
__global__ void scan(float * input, float * output, int len) {
//@@ Modify the body of this function to complete the functionality of
//@@ the scan on the device
//@@ You may need multiple kernel calls; write your kernels before this
//@@ function and call them from here
__shared__ float XY[BLOCK_SIZE*2];
unsigned int t=threadIdx.x;
unsigned int start=blockIdx.x*blockDim.x*2;
if(start+t<len)
{
XY[t]=input[start+t];
}
else
XY[t]=0;
if(start+t+blockDim.x<len)
XY[t+blockDim.x]=input[start+t+blockDim.x];
else
XY[t+blockDim.x]=0;
__syncthreads();
for(int stride=1;stride<=BLOCK_SIZE;stride*=2)
{
int index=(threadIdx.x+1)*stride*2-1;
if(index<2*BLOCK_SIZE)
XY[index]+=XY[index-stride];
__syncthreads();
}
for(int stride=BLOCK_SIZE/2;stride>0;stride/=2)
{
__syncthreads();
int index=(threadIdx.x+1)*stride*2-1;
if(index+stride<2*BLOCK_SIZE)
{
XY[index+stride]+=XY[index];
}
}
__syncthreads();
if(start+t<len)
output[start+t]=XY[threadIdx.x];
if(start+t+BLOCK_SIZE<len)
output[start + BLOCK_SIZE + t] = XY[BLOCK_SIZE + t];
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostInput; // The input 1D list
float * hostOutput; // The output list
float * deviceInput;
float * deviceOutput;
int numElements; // number of elements in the list
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
hostOutput = (float*) malloc(numElements * sizeof(float));
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The number of input elements in the input is ", numElements);
wbTime_start(GPU, "Allocating GPU memory.");
wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Clearing output memory.");
wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
wbTime_stop(GPU, "Clearing output memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float), cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 dimGrid((numElements-1)/(BLOCK_SIZE*2)+1,1,1);
dim3 dimBlock(BLOCK_SIZE,1,1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Modify this to complete the functionality of the scan
//@@ on the deivce
scan<<<dimGrid,dimBlock>>>(deviceInput,deviceOutput,numElements);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float), cudaMemcpyDeviceToHost));
wbTime_stop(Copy, "Copying output memory to the CPU");
//deal block
if(numElements>BLOCK_SIZE*2)
{
for (int t = 1; t <=ceil(numElements/BLOCK_SIZE/2); ++t)
{
for(int ii=0;ii<BLOCK_SIZE*2&&t*BLOCK_SIZE*2+ii<numElements;++ii)
{
hostOutput[t*BLOCK_SIZE*2+ii] += hostOutput[t*BLOCK_SIZE*2-1];
}
}
}
wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceInput);
cudaFree(deviceOutput);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, numElements);
free(hostInput);
free(hostOutput);
return 0;
}
others:
// MP Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}
#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)
__global__ void fixup(float *input, float *aux, int len) {
unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
if (blockIdx.x) {
if (start + t < len)
input[start + t] += aux[blockIdx.x - 1];
if (start + BLOCK_SIZE + t < len)
input[start + BLOCK_SIZE + t] += aux[blockIdx.x - 1];
}
}
__global__ void scan(float * input, float * output, float *aux, int len) {
//@@ Modify the body of this function to complete the functionality of
//@@ the scan on the device
//@@ You may need multiple kernel calls; write your kernels before this
//@@ function and call them from here
__shared__ float scan_array[BLOCK_SIZE << 1];
unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
if (start + t < len)
scan_array[t] = input[start + t];
else
scan_array[t] = 0;
if (start + BLOCK_SIZE + t < len)
scan_array[BLOCK_SIZE + t] = input[start + BLOCK_SIZE + t];
else
scan_array[BLOCK_SIZE + t] = 0;
__syncthreads();
int stride;
for (stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
int index = (t + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE)
scan_array[index] += scan_array[index - stride];
__syncthreads();
}
for (stride = BLOCK_SIZE >> 1; stride; stride >>= 1) {
int index = (t + 1) * stride * 2 - 1;
if (index + stride < 2 * BLOCK_SIZE)
scan_array[index + stride] += scan_array[index];
__syncthreads();
}
if (start + t < len)
output[start + t] = scan_array[t];
if (start + BLOCK_SIZE + t < len)
output[start + BLOCK_SIZE + t] = scan_array[BLOCK_SIZE + t];
if (aux && t == 0)
aux[blockIdx.x] = scan_array[2 * BLOCK_SIZE - 1];
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostInput; // The input 1D list
float * hostOutput; // The output list
float * deviceInput;
float * deviceOutput;
int numElements; // number of elements in the list
float *deviceAux, *deviceAuxScanned;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
hostOutput = (float*) malloc(numElements * sizeof(float));
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The number of input elements in the input is ", numElements);
wbTime_start(GPU, "Allocating GPU memory.");
wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
wbCheck(cudaMalloc((void **)&deviceAux, (BLOCK_SIZE << 1) * sizeof(float)));
wbCheck(cudaMalloc((void **)&deviceAuxScanned, (BLOCK_SIZE << 1) * sizeof(float)));
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Clearing output memory.");
wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
wbTime_stop(GPU, "Clearing output memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float), cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
int numBlocks = ceil((float)numElements/(BLOCK_SIZE<<1));
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(BLOCK_SIZE, 1, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Modify this to complete the functionality of the scan
//@@ on the deivce
scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, deviceAux, numElements);
cudaDeviceSynchronize();
scan<<<dim3(1,1,1), dimBlock>>>(deviceAux, deviceAuxScanned, NULL, BLOCK_SIZE << 1);
cudaDeviceSynchronize();
fixup<<<dimGrid, dimBlock>>>(deviceOutput, deviceAuxScanned, numElements);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float), cudaMemcpyDeviceToHost));
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceInput);
cudaFree(deviceOutput);
cudaFree(deviceAux);
cudaFree(deviceAuxScanned);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, numElements);
free(hostInput);
free(hostOutput);
return 0;
}