List Scan

This post walks through two CUDA implementations of prefix-sum (scan). The first runs a per-block scan kernel and then propagates each block's running total into the next block with a sequential loop on the host; the second uses an auxiliary array of block sums, scans that array with a second kernel launch, and applies a fixup kernel to correct the final result entirely on the device.
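
For reference, the inclusive scan both versions compute can be reproduced on the host with a simple sequential loop. This is a minimal sanity-check sketch; the helper name is my own and is not part of the lab harness:

// Sequential inclusive scan on the host (illustrative helper, not in the lab code).
void host_inclusive_scan(const float *in, float *out, int n) {
    float running = 0.0f;
    for (int i = 0; i < n; ++i) {
        running += in[i];   // inclusive: out[i] includes in[i]
        out[i] = running;
    }
}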


My code (per-block scan kernel plus a host-side fixup loop):

// MP Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}

#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
            return -1;                                                        \
        }                                                                     \
    } while(0)
    
__global__ void scan(float * input, float * output, int len) {
    //@@ Modify the body of this function to complete the functionality of
    //@@ the scan on the device
    //@@ You may need multiple kernel calls; write your kernels before this
    //@@ function and call them from here
    __shared__ float XY[BLOCK_SIZE * 2];
    unsigned int t = threadIdx.x;
    unsigned int start = blockIdx.x * blockDim.x * 2;

    // Each thread loads two elements of this block's section into shared memory,
    // padding with zeros past the end of the input.
    if (start + t < len)
        XY[t] = input[start + t];
    else
        XY[t] = 0;
    if (start + t + blockDim.x < len)
        XY[t + blockDim.x] = input[start + t + blockDim.x];
    else
        XY[t + blockDim.x] = 0;

    __syncthreads();

    // Up-sweep (reduction) phase of the Brent-Kung scan:
    // build partial sums at stride-spaced positions.
    for (int stride = 1; stride <= BLOCK_SIZE; stride *= 2) {
        int index = (t + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            XY[index] += XY[index - stride];
        __syncthreads();
    }
	
    // Down-sweep phase: distribute the partial sums to the remaining positions.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
        __syncthreads();
        int index = (t + 1) * stride * 2 - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            XY[index + stride] += XY[index];
    }
    __syncthreads();
	
    // Write the per-block inclusive scan results back to global memory.
    if (start + t < len)
        output[start + t] = XY[t];
    if (start + t + BLOCK_SIZE < len)
        output[start + t + BLOCK_SIZE] = XY[t + BLOCK_SIZE];
}

int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostInput; // The input 1D list
    float * hostOutput; // The output list
    float * deviceInput;
    float * deviceOutput;
    int numElements; // number of elements in the list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float*) malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numElements);

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
    wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    dim3 dimGrid((numElements-1)/(BLOCK_SIZE*2)+1,1,1);  
    dim3 dimBlock(BLOCK_SIZE,1,1);  
    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Modify this to complete the functionality of the scan
    //@@ on the device
    scan<<<dimGrid,dimBlock>>>(deviceInput,deviceOutput,numElements);  
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");
 
    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    // Host-side fixup: propagate each block's last (running-total) element into
    // every element of the following block. Blocks are processed in order, so the
    // value being added has already been corrected when it is used.
    if (numElements > BLOCK_SIZE * 2) {
        int numBlocks = (numElements - 1) / (BLOCK_SIZE * 2) + 1;
        for (int t = 1; t < numBlocks; ++t) {
            for (int ii = 0; ii < BLOCK_SIZE * 2 && t * BLOCK_SIZE * 2 + ii < numElements; ++ii) {
                hostOutput[t * BLOCK_SIZE * 2 + ii] += hostOutput[t * BLOCK_SIZE * 2 - 1];
            }
        }
    }
	
    wbTime_start(GPU, "Freeing GPU Memory");
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
}
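
The kernel above only computes a correct scan within each 2 * BLOCK_SIZE-element section; the host loop after the device-to-host copy then adds the last element of each already-corrected block into the next block. To make the two shared-memory loops easier to follow, here is a minimal host-side sketch of the same up-sweep/down-sweep (Brent-Kung) pattern on a fixed 8-element array; the function name and the size 8 are illustrative assumptions, not part of the lab code:

// Host-side illustration of the Brent-Kung scan pattern used in the kernel.
// The fixed size 8 and the name brent_kung_demo are illustrative only.
void brent_kung_demo(float *xy /* length 8 */) {
    const int N = 8;                        // plays the role of 2 * BLOCK_SIZE
    // Up-sweep: build partial sums at stride-spaced positions.
    for (int stride = 1; stride <= N / 2; stride *= 2)
        for (int t = 0; t < N / 2; ++t) {   // each t corresponds to one thread
            int index = (t + 1) * stride * 2 - 1;
            if (index < N)
                xy[index] += xy[index - stride];
        }
    // Down-sweep: distribute the partial sums to the remaining positions.
    for (int stride = N / 4; stride > 0; stride /= 2)
        for (int t = 0; t < N / 2; ++t) {
            int index = (t + 1) * stride * 2 - 1;
            if (index + stride < N)
                xy[index + stride] += xy[index];
        }
}

In the kernel, the inner loop over t is replaced by the threads of the block running in parallel, and __syncthreads() enforces the ordering between strides that the sequential loop gets for free.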


Others' code (auxiliary-array version with a fixup kernel):

// MP Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}

#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
            return -1;                                                        \
        }                                                                     \
    } while(0)

__global__ void fixup(float *input, float *aux, int len) {
    // Add the scanned total of all preceding blocks (aux[blockIdx.x - 1]) to every
    // element handled by this block; block 0 needs no correction.
    unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
    if (blockIdx.x) {
       if (start + t < len)
          input[start + t] += aux[blockIdx.x - 1];
       if (start + BLOCK_SIZE + t < len)
          input[start + BLOCK_SIZE + t] += aux[blockIdx.x - 1];
    }
}
 
__global__ void scan(float * input, float * output, float *aux, int len) {
    //@@ Modify the body of this function to complete the functionality of
    //@@ the scan on the device
    //@@ You may need multiple kernel calls; write your kernels before this
    //@@ function and call them from here
    __shared__ float scan_array[BLOCK_SIZE << 1];
    unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
    if (start + t < len)
       scan_array[t] = input[start + t];
    else
       scan_array[t] = 0;
    if (start + BLOCK_SIZE + t < len)
       scan_array[BLOCK_SIZE + t] = input[start + BLOCK_SIZE + t];
    else
       scan_array[BLOCK_SIZE + t] = 0;
    __syncthreads();

    // Up-sweep (reduction) phase.
    int stride;
    for (stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
       int index = (t + 1) * stride * 2 - 1;
       if (index < 2 * BLOCK_SIZE)
          scan_array[index] += scan_array[index - stride];
       __syncthreads();
    }
 
    // Down-sweep phase.
    for (stride = BLOCK_SIZE >> 1; stride; stride >>= 1) {
       int index = (t + 1) * stride * 2 - 1;
       if (index + stride < 2 * BLOCK_SIZE)
          scan_array[index + stride] += scan_array[index];
       __syncthreads();
    }
 
    if (start + t < len)
       output[start + t] = scan_array[t];
    if (start + BLOCK_SIZE + t < len)
       output[start + BLOCK_SIZE + t] = scan_array[BLOCK_SIZE + t];
 
    // Thread 0 records this block's total so it can be scanned in a second pass.
    if (aux && t == 0)
       aux[blockIdx.x] = scan_array[2 * BLOCK_SIZE - 1];
}

int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostInput; // The input 1D list
    float * hostOutput; // The output list
    float * deviceInput;
    float * deviceOutput;
    int numElements; // number of elements in the list
	float *deviceAux, *deviceAuxScanned;
	
    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float*) malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numElements);

    
	wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
    wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
	wbCheck(cudaMalloc((void **)&deviceAux, (BLOCK_SIZE << 1) * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceAuxScanned, (BLOCK_SIZE << 1) * sizeof(float)));
	
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
	int numBlocks = ceil((float)numElements/(BLOCK_SIZE<<1));
    dim3 dimGrid(numBlocks, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);
   

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Modify this to complete the functionality of the scan
    //@@ on the device
    // First pass: per-block scan; each block's total is written to deviceAux.
    scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, deviceAux, numElements);
    cudaDeviceSynchronize();
    // Second pass: scan the block totals with a single block (handles up to
    // 2 * BLOCK_SIZE totals).
    scan<<<dim3(1,1,1), dimBlock>>>(deviceAux, deviceAuxScanned, NULL, BLOCK_SIZE << 1);
    cudaDeviceSynchronize();
    // Third pass: add the scanned block totals back into each block's output.
    fixup<<<dimGrid, dimBlock>>>(deviceOutput, deviceAuxScanned, numElements);

    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
	cudaFree(deviceAux);
    cudaFree(deviceAuxScanned);
    
	wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
}
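
A practical caveat with this version: the second scan launch uses a single block, so it can only scan up to 2 * BLOCK_SIZE block totals, which caps the supported input at (2 * BLOCK_SIZE) * (2 * BLOCK_SIZE) elements, roughly one million with BLOCK_SIZE = 512. If you want that limit to be explicit, a guard like the following could be added near the start of main; this check is my addition, not part of the original code:

    // Illustrative guard (not in the original code): the single-block scan of
    // deviceAux handles at most 2 * BLOCK_SIZE block totals, so the input may
    // contain at most (2 * BLOCK_SIZE) * (2 * BLOCK_SIZE) elements.
    if (numElements > (BLOCK_SIZE << 1) * (BLOCK_SIZE << 1)) {
        wbLog(ERROR, "Input too large for a single-level auxiliary scan: ", numElements);
        return -1;
    }

Larger inputs would need either another scan level for the auxiliary array or a host-side pass like the one in the first version.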

