// MP Reduction
// Given a list (lst) of length n
// Output its sum: lst[0] + lst[1] + ... + lst[n-1]
#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
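// Note: the stride-halving loop in the kernel below assumes BLOCK_SIZE is a
// power of two; a non-power-of-two block size would leave some elements out
// of the final sum.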
#define wbCheck(stmt) do {                                                \
        cudaError_t err = stmt;                                           \
        if (err != cudaSuccess) {                                         \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                   \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
            return -1;                                                    \
        }                                                                 \
    } while(0)
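// Usage note: wrap CUDA runtime calls whose failure should abort the run,
// e.g. wbCheck(cudaMalloc(...)). The do { ... } while(0) wrapper makes the
// macro expand to a single statement, so it composes safely with if/else.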
__global__ void total(float * input, float * output, int len) {
    //@@ Load a segment of the input vector into shared memory
    //@@ Traverse the reduction tree
    //@@ Write the computed sum of the block to the output vector at the
    //@@ correct index
    __shared__ float partialSum[BLOCK_SIZE * 2];
    unsigned int t = threadIdx.x;
    unsigned int start = blockIdx.x * blockDim.x * 2;

    // Each thread loads two elements, one blockDim.x apart, padding with 0
    // past the end of the input
    if (start + t < len)
        partialSum[t] = input[start + t];
    else
        partialSum[t] = 0;
    if (start + t + blockDim.x < len)
        partialSum[t + blockDim.x] = input[start + t + blockDim.x];
    else
        partialSum[t + blockDim.x] = 0;

    // Reduction tree: halve the stride each step; the barrier at the top of
    // each iteration makes the previous step's writes (and the initial loads)
    // visible before they are read
    for (unsigned int stride = blockDim.x; stride > 0; stride /= 2) {
        __syncthreads();
        if (t < stride)
            partialSum[t] += partialSum[t + stride];
    }

    // Thread 0 wrote partialSum[0] itself in the last step, so no further
    // barrier is needed before publishing the block's sum
    if (t == 0)
        output[blockIdx.x] = partialSum[0];
}
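// Illustrative trace of the reduction tree, assuming blockDim.x = 4 (so 8
// elements in shared memory):
//   stride 4: partialSum[0..3] += partialSum[4..7]
//   stride 2: partialSum[0..1] += partialSum[2..3]
//   stride 1: partialSum[0]    += partialSum[1]
// After log2(2*blockDim.x) steps, partialSum[0] holds the block's total.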
int main(int argc, char ** argv) {
    int ii;
    wbArg_t args;
    float * hostInput;  // The input 1D list
    float * hostOutput; // The output list
    float * deviceInput;
    float * deviceOutput;
    int numInputElements;  // number of elements in the input list
    int numOutputElements; // number of elements in the output list
    args = wbArg_read(argc, argv);
    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numInputElements);
    // Each block produces one partial sum from 2*BLOCK_SIZE inputs, rounded up
    numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    }
    hostOutput = (float *) malloc(numOutputElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");
    wbLog(TRACE, "The number of input elements in the input is ", numInputElements);
    wbLog(TRACE, "The number of elements in the output is ", numOutputElements);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceInput,sizeof(float)*numInputElements);
cudaMalloc((void**)&deviceOutput,sizeof(float)*numOutputElements);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceInput,hostInput,sizeof(float)*numInputElements,cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
    //@@ Initialize the grid and block dimensions here
    dim3 dimGrid(numOutputElements, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);
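    // Sizing note: each block consumes 2*BLOCK_SIZE input elements and emits
    // one partial sum, so numOutputElements blocks cover the whole input.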
    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    total<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");
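    // The kernel launch is asynchronous; cudaDeviceSynchronize() above blocks
    // until it finishes, so the Compute timer measures the actual kernel time.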
    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");
    /********************************************************************
     * Reduce the output vector on the host
     * NOTE: One could also perform the reduction of the output vector
     * recursively on the device and support any size input; see the
     * sketch after this loop. For simplicity, we do not require that
     * for this lab.
     ********************************************************************/
    for (ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii];
    }
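    /* A minimal sketch of the recursive device-side reduction mentioned in
     * the note above (not required here, shown for illustration only). It
     * re-launches the same kernel until one element remains, ping-ponging
     * between the two existing buffers; note that it overwrites deviceInput.
     *
     *   int n = numInputElements;
     *   float *in = deviceInput, *out = deviceOutput;
     *   while (n > 1) {
     *       int blocks = (n + (BLOCK_SIZE << 1) - 1) / (BLOCK_SIZE << 1);
     *       total<<<blocks, BLOCK_SIZE>>>(in, out, n);
     *       n = blocks;
     *       float *tmp = in; in = out; out = tmp; // results are now in 'in'
     *   }
     *   // copy the single float in[0] back to the host
     */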
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceInput);
cudaFree(deviceOutput);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, 1);
free(hostInput);
free(hostOutput);
return 0;
}