Learning the Nvidia-OpenCL-SDK-Code-Samples [3]

This post digs into the scan (prefix-sum) implementation in OpenCL, in particular the parallel-computation strategies used to optimize for different array lengths. Through an analysis of the core functions scan1Inclusive and scan4Inclusive, it shows how local memory and cooperating work-items are used to compute prefix sums efficiently.


Next in line would be the MultiGPU and OpenGLInterop projects, but I'm skipping both for now: my machine has only one GPU, and I don't need OpenGL interop yet.

So the next one is the OpenCL Scan project. At first glance it looks nothing like the previous samples, and it seems pretty hard.

The cl file:

////////////////////////////////////////////////////////////////////////////////
// Scan codelets
////////////////////////////////////////////////////////////////////////////////
#if(1)
    //Naive inclusive scan: O(N * log2(N)) operations
    //Allocate 2 * 'size' local memory, initialize the first half
    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
    //and saving instructions
    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        for(uint offset = 1; offset < size; offset <<= 1){
            barrier(CLK_LOCAL_MEM_FENCE);
            uint t = l_Data[pos] + l_Data[pos - offset];
            barrier(CLK_LOCAL_MEM_FENCE);
            l_Data[pos] = t;
        }

        return l_Data[pos];
    }

    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }

#else
    #define LOG2_WARP_SIZE 5U
    #define      WARP_SIZE (1U << LOG2_WARP_SIZE)

    //Almost the same as naive scan1Inclusive but doesn't need barriers
    //and works only for size <= WARP_SIZE
    inline uint warpScanInclusive(uint idata, volatile __local uint *l_Data, uint size){
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        if(size >=  2) l_Data[pos] += l_Data[pos -  1];
        if(size >=  4) l_Data[pos] += l_Data[pos -  2];
        if(size >=  8) l_Data[pos] += l_Data[pos -  4];
        if(size >= 16) l_Data[pos] += l_Data[pos -  8];
        if(size >= 32) l_Data[pos] += l_Data[pos - 16];

        return l_Data[pos];
    }

    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
        return warpScanInclusive(idata, l_Data, size) - idata;
    }

    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        if(size > WARP_SIZE){
            //Bottom-level inclusive warp scan
            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);

            //Save top elements of each warp for exclusive warp scan
            //sync to wait for warp scans to complete (because l_Data is being overwritten)
            barrier(CLK_LOCAL_MEM_FENCE);
            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;

            //wait for warp scans to complete
            barrier(CLK_LOCAL_MEM_FENCE);
            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
                //grab top warp elements
                uint val = l_Data[get_local_id(0)];
                //calculate exclusive scan and write back to shared memory
                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
            }

            //return updated warp scans with exclusive scan results
            barrier(CLK_LOCAL_MEM_FENCE);
            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
        }else{
            return warpScanInclusive(idata, l_Data, size);
        }
    }

    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }
#endif


//Vector scan: the array to be scanned is stored
//in work-item private memory as uint4
inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
    //Level-0 inclusive scan
    data4.y += data4.x;
    data4.z += data4.y;
    data4.w += data4.z;

    //Level-1 exclusive scan
    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;

    return (data4 + (uint4)val);
}

inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
    return scan4Inclusive(data4, l_Data, size) - data4;
}

////////////////////////////////////////////////////////////////////////////////
// Scan kernels
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal1(
    __global uint4 *d_Dst,
    __global uint4 *d_Src,
    __local uint *l_Data,
    uint size
){
    //Load data
    uint4 idata4 = d_Src[get_global_id(0)];

    //Calculate exclusive scan
    uint4 odata4  = scan4Exclusive(idata4, l_Data, size);

    //Write back
    d_Dst[get_global_id(0)] = odata4;
}

//Exclusive scan of top elements of bottom-level scans (4 * WORKGROUP_SIZE)
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal2(
    __global uint *d_Buf,
    __global uint *d_Dst,
    __global uint *d_Src,
    __local uint *l_Data,
    uint N,
    uint arrayLength
){
    //Load top elements
    //Convert results of bottom-level scan back to inclusive
    //Skip loads and stores for inactive work-items of the work-group with highest index (pos >= N)
    uint data = 0;
    if(get_global_id(0) < N)
        data =
            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];

    //Compute
    uint odata = scan1Exclusive(data, l_Data, arrayLength);

    //Avoid out-of-bound access
    if(get_global_id(0) < N)
        d_Buf[get_global_id(0)] = odata;
}

//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void uniformUpdate(
    __global uint4 *d_Data,
    __global uint *d_Buf
){
    __local uint buf[1];

    uint4 data4 = d_Data[get_global_id(0)];

    if(get_local_id(0) == 0)
        buf[0] = d_Buf[get_group_id(0)];

    barrier(CLK_LOCAL_MEM_FENCE);
    data4 += (uint4)buf[0];
    d_Data[get_global_id(0)] = data4;
}
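
Before looking at the host code, it may help to see how the three kernels above compose for a single large array. The following is a pure-C simulation of the three passes, my own sketch of what the SDK's scanExclusiveLarge wrapper presumably orchestrates on the device (n is assumed to be a multiple of 4 * WORKGROUP_SIZE):

#include <stdlib.h>

typedef unsigned int uint;

enum { WORKGROUP_SIZE = 256, BLOCK = 4 * WORKGROUP_SIZE };

void largeScanSim(uint *dst, const uint *src, size_t n){
    size_t numBlocks = n / BLOCK;
    uint *buf = malloc(numBlocks * sizeof(uint));   /* plays the role of d_Buf */

    /* Pass 1 (scanExclusiveLocal1): independent exclusive scan of each block */
    for(size_t b = 0; b < numBlocks; b++){
        uint sum = 0;
        for(size_t i = 0; i < BLOCK; i++){
            dst[b * BLOCK + i] = sum;
            sum += src[b * BLOCK + i];
        }
    }

    /* Pass 2 (scanExclusiveLocal2): exclusive scan of the per-block totals.
       Last exclusive element + last input recovers each block's inclusive
       total, which is exactly what the kernel loads from d_Dst[...] + d_Src[...]. */
    uint running = 0;
    for(size_t b = 0; b < numBlocks; b++){
        uint blockTotal = dst[(b + 1) * BLOCK - 1] + src[(b + 1) * BLOCK - 1];
        buf[b] = running;
        running += blockTotal;
    }

    /* Pass 3 (uniformUpdate): add each block's offset to all of its elements */
    for(size_t b = 0; b < numBlocks; b++)
        for(size_t i = 0; i < BLOCK; i++)
            dst[b * BLOCK + i] += buf[b];

    free(buf);
}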

The main function:

int main(int argc, const char **argv)
{
    shrQAStart(argc, (char **)argv);

    // Start logs
    shrSetLogFileName ("oclScan.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context      cxGPUContext;    //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command queue
    cl_mem d_Input, d_Output;        //OpenCL memory buffer objects

    cl_int ciErrNum;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    const uint N = 13 * 1048576 / 2;

    shrLog("Allocating and initializing host arrays...\n");
        h_Input     = (uint *)malloc(N * sizeof(uint));
        h_OutputCPU = (uint *)malloc(N * sizeof(uint));
        h_OutputGPU = (uint *)malloc(N * sizeof(uint));
        srand(2009);
        for(uint i = 0; i < N; i++)
            h_Input[i] = rand();

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL scan...\n");
        initScan(cxGPUContext, cqCommandQueue, argv);

    shrLog("Creating OpenCL memory objects...\n\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N * sizeof(uint), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(uint), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    int globalFlag = 1; // init pass/fail flag to pass
    size_t szWorkgroup;
    const int iCycles = 100;
    shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
    for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength *= 2)
    {
        shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
            clFinish(cqCommandQueue);
            shrDeltaT(0);
            for (int i = 0; i<iCycles; i++)
            {
                szWorkgroup = scanExclusiveShort(
                    cqCommandQueue,
                    d_Output,
                    d_Input,
                    N / arrayLength,
                    arrayLength
                );
            }
            clFinish(cqCommandQueue);
            double timerValue = shrDeltaT(0)/(double)iCycles;

        shrLog("Validating the results...\n");
            shrLog(" ...reading back OpenCL memory\n");
                ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL);
                oclCheckError(ciErrNum, CL_SUCCESS);

            shrLog(" ...scanExclusiveHost()\n");
                scanExclusiveHost(
                    h_OutputCPU,
                    h_Input,
                    N / arrayLength,
                    arrayLength
                );

            // Compare GPU results with CPU results and accumulate error for this test
            shrLog(" ...comparing the results\n");
                int localFlag = 1;
                for(uint i = 0; i < N; i++)
                {
                    if(h_OutputCPU[i] != h_OutputGPU[i])
                    {
                        localFlag = 0;
                        break;
                    }
                }

            // Log message on individual test result, then accumulate to global flag
            shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
            globalFlag = globalFlag && localFlag;

            #ifdef GPU_PROFILING
                if (arrayLength == MAX_SHORT_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
            #endif
    }

    shrLog("*** Running GPU scan for large arrays (%d identical iterations)...\n\n", iCycles);
    for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength *= 2)
    {
        shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
            clFinish(cqCommandQueue);
            shrDeltaT(0);
            for (int i = 0; i<iCycles; i++)
            {
                szWorkgroup = scanExclusiveLarge(
                    cqCommandQueue,
                    d_Output,
                    d_Input,
                    N / arrayLength,
                    arrayLength
                );
            }
            clFinish(cqCommandQueue);
            double timerValue = shrDeltaT(0)/(double)iCycles;

        shrLog("Validating the results...\n");
            shrLog(" ...reading back OpenCL memory\n");
                ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL);
                oclCheckError(ciErrNum, CL_SUCCESS);

            shrLog(" ...scanExclusiveHost()\n");
                scanExclusiveHost(
                    h_OutputCPU,
                    h_Input,
                    N / arrayLength,
                    arrayLength
                );

            // Compare GPU results with CPU results and accumulate error for this test
            shrLog(" ...comparing the results\n");
                int localFlag = 1;
                for(uint i = 0; i < N; i++)
                {
                    if(h_OutputCPU[i] != h_OutputGPU[i])
                    {
                        localFlag = 0;
                        break;
                    }
                }

            // Log message on individual test result, then accumulate to global flag
            shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
            globalFlag = globalFlag && localFlag;

            #ifdef GPU_PROFILING
                if (arrayLength == MAX_LARGE_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
            #endif
    }

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeScan();

        //Release other OpenCL Objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);


    // finish
    // pass or fail (cumulative... all tests in the loop)
    shrQAFinishExit(argc, (const char **)argv, globalFlag ? QA_PASSED : QA_FAILED);

        //Finish
        shrEXIT(argc, argv);
}
In plain terms: src is an array of N = 6815744 elements, and we create another array of the same size in which each value is the sum of the values at all preceding positions, i.e. an exclusive prefix sum.
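
To pin the definition down, here is a minimal host-side reference in C, roughly what scanExclusiveHost presumably computes per array (the SDK version additionally loops over a batch of independent arrays; this single-array form is my own sketch):

#include <stddef.h>

typedef unsigned int uint;

/* Exclusive prefix sum of one array: dst[i] = src[0] + ... + src[i-1],
   with dst[0] = 0. */
void exclusiveScanRef(uint *dst, const uint *src, size_t n){
    uint sum = 0;
    for(size_t i = 0; i < n; i++){
        dst[i] = sum;      /* exclusive: src[i] itself is not included */
        sum += src[i];
    }
}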
But one thing puzzled me. On the host side, h_Input is an array of N uints, and the buffer d_Input is likewise created with N * sizeof(uint) bytes. That d_Input is then passed to the kernel scanExclusiveLocal1, whose corresponding parameter is __global uint4 *d_Src. So a uint buffer can simply be handed to a uint4 parameter, without even an explicit cast??? I assumed that passing it like this would fill only uint4.x, and that uint4.y, uint4.z and uint4.w would be random, meaningless values. That is not what happens!!! Suppose you have a buffer of 100 uint elements (on AMD's implementation, int/uint have the same size as the plain host int/uint, so I won't bother distinguishing cl_int and friends here). If you point a uint *p1 at it, you naturally still have 100 valid elements, p1[0] through p1[99]. You can also point a uint4 *p2 at it; that works too, but then there are only 25 valid elements, p2[0] through p2[24]. What's invalid is p2[25] through p2[99], not, as I imagined, each of p2[0] through p2[99] having only a valid x while y/z/w are garbage!!! A more experienced developer pointed out that this is the Nth time I've tripped over this kind of memory-layout issue; the previous two examples caught me out the same way!
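
The same reinterpretation can be demonstrated on the host. uint4_t below is a hypothetical stand-in for OpenCL's uint4 (the real host-side type would be cl_uint4); casting the pointer changes only how the bytes are grouped, not which bytes are valid:

#include <stdio.h>

/* Hypothetical host-side stand-in for OpenCL's uint4:
   four packed 32-bit values, same layout as the device type. */
typedef struct { unsigned x, y, z, w; } uint4_t;

int main(void){
    unsigned buf[100];
    for(int i = 0; i < 100; i++) buf[i] = i;

    unsigned *p1 = buf;            /* 100 valid elements: p1[0]..p1[99] */
    uint4_t  *p2 = (uint4_t *)buf; /*  25 valid elements: p2[0]..p2[24] */

    /* p2[1] covers buf[4..7]; all four lanes hold real data. */
    printf("p2[1] = {%u, %u, %u, %u}\n", p2[1].x, p2[1].y, p2[1].z, p2[1].w);
    printf("p1[4..7] = %u %u %u %u\n", p1[4], p1[5], p1[6], p1[7]);
    return 0;
}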


I'm now working through main() step by step. With arrayLength = 4, localsize = 256 and globalsize = N/4 = 1703936, and the first kernel runs: globalsize work-items each call scan4Exclusive(). Because the uint buffer is reinterpreted as uint4, all N numbers are covered. scan4Exclusive() in turn calls scan4Inclusive(), so each global ID is responsible for 4 of the original uints; scan4Inclusive() adds those 4 numbers up step by step, so that each of the 4 becomes the sum of itself and the numbers before it. scan4Inclusive() then calls scan1Inclusive(), and that's where I got stuck. scan1Inclusive() uses the local ID. The localsize is 256, and the code does l_Data[pos] = 0; pos += size; l_Data[pos] = idata; (with arrayLength = 4, size = 1 here), so each local ID writes 2 slots; 256 local IDs * 2 = 512, which is why l_Data being 512 entries makes sense. With size = 1 the even slots of l_Data all hold 0, and the odd slots hold, respectively, the sum of the first 4 original uints, the sum of elements 5-8, the sum of 9-12, and so on. At this point my head was spinning...
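
To see that layout without tracing the kernel by hand, here is a small host-side simulation of the index math (my own sketch, not SDK code). For size = 1 the zeros land on the even slots and the data on the odd slots; for size = 4 each group of 4 work-items gets 4 padding zeros followed by its 4 data slots, which is exactly the padding the comment about avoiding if(pos >= offset) refers to:

#include <stdio.h>

/* For each work-item, scan1Inclusive writes a 0 at l_Data[pos] and its
   input at l_Data[pos + size]. This prints where those writes land. */
int main(void){
    unsigned sizes[] = {1, 4};
    for(int s = 0; s < 2; s++){
        unsigned size = sizes[s];
        printf("size = %u:\n", size);
        for(unsigned lid = 0; lid < 8; lid++){
            unsigned pos = 2 * lid - (lid & (size - 1));
            printf("  local_id %u: zero -> l_Data[%2u], idata -> l_Data[%2u]\n",
                   lid, pos, pos + size);
        }
    }
    return 0;
}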