自写代码来理解 get_global_id 和 get_global_size

原创已于 2024-01-10 08:26:22 修改 · 849 阅读

8 ·

CC 4.0 BY-SA版权

文章标签：

#c++

于 2024-01-09 17:30:06 首次发布

<2022-01-24 周一>

《OpenCL编程指南》第三章

自写代码来理解`get_global_id`和`get_global_size`

使用本书第三章中关于输入信号卷积的代码来进行理解，见随书代码“src/Chapter_3/OpenCLConvolution”，附代码如下：

//
// Book:      OpenCL(R) Programming Guide
// Authors:   Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10:   0-321-74964-2
// ISBN-13:   978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs:      http://safari.informit.com/9780132488006/
//            http://www.openclprogrammingguide.com
//

// Convolution.cpp
//
//    This is a simple example that demonstrates OpenCL platform, device, and context
//    use.

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <iomanip>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif

// Constants
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
  {3, 1, 1, 4, 8, 2, 1, 3},
  {4, 2, 1, 1, 2, 1, 2, 3},
  {4, 4, 4, 4, 3, 2, 2, 2},
  {9, 8, 3, 8, 9, 0, 0, 0},
  {9, 3, 3, 9, 0, 0, 0, 0},
  {0, 9, 0, 8, 0, 0, 0, 0},
  {3, 0, 8, 8, 9, 4, 4, 4},
  {5, 9, 8, 1, 8, 1, 1, 1}
};

const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;

cl_uint outputSignal[outputSignalWidth][outputSignalHeight];

const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

cl_uint mask[maskWidth][maskHeight] =
{
  {1, 1, 1}, {1, 0, 1}, {1, 1, 1},
};

///
// Function to check and handle OpenCL errors
inline void
checkErr(cl_int err, const char * name)
{
  if (err != CL_SUCCESS) {
    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
}

void CL_CALLBACK contextCallback(
  const char * errInfo,
  const void * private_info,
  size_t cb,
  void * user_data)
{
  std::cout << "Error occured during context use: " << errInfo << std::endl;
  // should really perform any clearup and so on at this point
  // but for simplicitly just exit.
  exit(1);
}

///
//	main() for Convoloution example
//
int main(int argc, char** argv)
{
  cl_int errNum;
  cl_uint numPlatforms;
  cl_uint numDevices;
  cl_platform_id * platformIDs;
  cl_device_id * deviceIDs;
  cl_context context = NULL;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem inputSignalBuffer;
  cl_mem outputSignalBuffer;
  cl_mem maskBuffer;

  // First, select an OpenCL platform to run on.
  errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
  checkErr(
    (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
    "clGetPlatformIDs");

  platformIDs = (cl_platform_id *)alloca(
    sizeof(cl_platform_id) * numPlatforms);

  errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
  checkErr(
    (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
    "clGetPlatformIDs");

  // Iterate through the list of platforms until we find one that supports
  // a CPU device, otherwise fail with an error.
  deviceIDs = NULL;
  cl_uint i;
  for (i = 0; i < numPlatforms; i++)
  {
    errNum = clGetDeviceIDs(
      platformIDs[i],
      CL_DEVICE_TYPE_CPU,
      0,
      NULL,
      &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
    {
      checkErr(errNum, "clGetDeviceIDs");
    }
    else if (numDevices > 0)
    {
      deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
      errNum = clGetDeviceIDs(
        platformIDs[i],
        CL_DEVICE_TYPE_CPU,
        numDevices,
        &deviceIDs[0],
        NULL);
      checkErr(errNum, "clGetDeviceIDs");
      break;
    }
  }

  // Check to see if we found at least one CPU device, otherwise return
  if (deviceIDs == NULL) {
    std::cout << "No CPU device found" << std::endl;
    exit(-1);
  }

  // Next, create an OpenCL context on the selected platform.
  cl_context_properties contextProperties[] =
  {
      CL_CONTEXT_PLATFORM,
      (cl_context_properties)platformIDs[i],
      0
  };
  context = clCreateContext(
    contextProperties,
    numDevices,
    deviceIDs,
    &contextCallback,
    NULL,
    &errNum);
  checkErr(errNum, "clCreateContext");

  std::ifstream srcFile("Convolution.cl");
  checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");

  std::string srcProg(
    std::istreambuf_iterator<char>(srcFile),
    (std::istreambuf_iterator<char>()));

  const char * src = srcProg.c_str();
  size_t length = srcProg.length();

  // Create program from source
  program = clCreateProgramWithSource(
    context,
    1,
    &src,
    &length,
    &errNum);
  checkErr(errNum, "clCreateProgramWithSource");

  // Build program
  errNum = clBuildProgram(
    program,
    numDevices,
    deviceIDs,
    NULL,
    NULL,
    NULL);
  if (errNum != CL_SUCCESS)
  {
    // Determine the reason for the error
    char buildLog[16384];
    clGetProgramBuildInfo(
      program,
      deviceIDs[0],
      CL_PROGRAM_BUILD_LOG,
      sizeof(buildLog),
      buildLog,
      NULL);

    std::cerr << "Error in kernel: " << std::endl;
    std::cerr << buildLog;
    checkErr(errNum, "clBuildProgram");
  }

  // Create kernel object
  kernel = clCreateKernel(
    program,
    "convolve",
    &errNum);
  checkErr(errNum, "clCreateKernel");

  // Now allocate buffers
  inputSignalBuffer = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
    static_cast<void *>(inputSignal),
    &errNum);
  checkErr(errNum, "clCreateBuffer(inputSignal)");

  maskBuffer = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_uint) * maskHeight * maskWidth,
    static_cast<void *>(mask),
    &errNum);
  checkErr(errNum, "clCreateBuffer(mask)");

  outputSignalBuffer = clCreateBuffer(
    context,
    CL_MEM_WRITE_ONLY,
    sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
    NULL,
    &errNum);
  checkErr(errNum, "clCreateBuffer(outputSignal)");

  // Pick the first device and create command queue.
  queue = clCreateCommandQueue(
    context,
    deviceIDs[0],
    0,
    &errNum);
  checkErr(errNum, "clCreateCommandQueue");

  errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
  errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
  errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
  errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
  errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
  checkErr(errNum, "clSetKernelArg");

  const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight };
  const size_t localWorkSize[1] = { 1 };

  // Queue the kernel up for execution across the array
  errNum = clEnqueueNDRangeKernel(
    queue,
    kernel,
    1,
    NULL,
    globalWorkSize,
    localWorkSize,
    0,
    NULL,
    NULL);
  checkErr(errNum, "clEnqueueNDRangeKernel");

  errNum = clEnqueueReadBuffer(
    queue,
    outputSignalBuffer,
    CL_TRUE,
    0,
    sizeof(cl_uint) * outputSignalHeight * outputSignalHeight,
    outputSignal,
    0,
    NULL,
    NULL);
  checkErr(errNum, "clEnqueueReadBuffer");

  // Output the result buffer
  for (int y = 0; y < outputSignalHeight; y++)
  {
    for (int x = 0; x < outputSignalWidth; x++)
    {
      std::cout << std::setw(2) << outputSignal[x][y] << " ";
    }
    std::cout << std::endl;
  }

  std::cout << std::endl << "Executed program succesfully." << std::endl;

  return 0;
}

//
// Book:      OpenCL(R) Programming Guide
// Authors:   Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10:   0-321-74964-2
// ISBN-13:   978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs:      http://safari.informit.com/9780132488006/
//            http://www.openclprogrammingguide.com
//

// Convolution.cl
//
//    This is a simple kernel performing convolution.

__kernel void convolve(
	const __global  uint * const input,
    __constant uint * const mask,
    __global  uint * const output,
    const int inputWidth,
    const int maskWidth)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    uint sum = 0;
    for (int r = 0; r < maskWidth; r++)
    {
        const int idxIntmp = (y + r) * inputWidth + x;

        for (int c = 0; c < maskWidth; c++)
        {
            sum += mask[(r * maskWidth) + c] * input[idxIntmp + c];
        }
    }

    output[y * get_global_size(0) + x] = sum;
}

注：代码中clGetDeviceIDs()函数使用CL_DEVICE_TYPE_CPU，虽然编译和执行都没有问题，但是在vs2017上单步调试时clCreateContext()这个函数会出错，返回错误-2，这是CL_DEVICE_NOT_AVAILABLE错误：

CL_DEVICE_NOT_AVAILABLE if a device in devices is currently not available even though the device was returned by clGetDeviceIDs.

具体原因不深究了，将CL_DEVICE_TYPE_CPU都改为CL_DEVICE_TYPE_GPU就一切正常了。代码输出：

22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

Executed program succesfully.

我想用c++代码来模拟kernel函数，实现同样的效果，因此我有如下的c++代码：

#include <stdio.h>

int input[8][8] = {
  {3, 1, 1, 4, 8, 2, 1, 3},
  {4, 2, 1, 1, 2, 1, 2, 3},
  {4, 4, 4, 4, 3, 2, 2, 2},
  {9, 8, 3, 8, 9, 0, 0, 0},
  {9, 3, 3, 9, 0, 0, 0, 0},
  {0, 9, 0, 8, 0, 0, 0, 0},
  {3, 0, 8, 8, 9, 4, 4, 4},
  {5, 9, 8, 1, 8, 1, 1, 1}
};

int mask[3][3] = {
  {1, 1, 1},
  {1, 0, 1},
  {1, 1, 1},
};

int output[6][6] = { 0 };

void convolve()
{
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      int sum = 0;
      for (int r = 0; r < 3; r++) {
        const int idxIntmp = (y + r) * 8 + x;

        for (int c = 0; c < 3; c++)
          sum += *((int*)mask + (r * 3) + c) * *((int*)input + idxIntmp + c);
      }
      *((int*)output + y * 6 + x) = sum;
    }
  }
}

int main()
{
  convolve();

  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x)
      printf("%2d ", output[x][y]);
    printf("\n");
  }
}

对比一下，发现结果不一致：

// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

// 第一次
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16  9 17 23
16 10  6  0 12 11

百思不得其解，确信我的理解是对的，get_global_id(0)和get_global_id(1)的值都为6，如果将上述c++代码修改为：

void convolve()
{
  for (int y = 0; y < /*6*/8; ++y) {
    for (int x = 0; x < /*6*/8; ++x) {
      int sum = 0;
      for (int r = 0; r < 3; r++) {
        const int idxIntmp = (y + r) * 8 + x;

        for (int c = 0; c < 3; c++)
          sum += *((int*)mask + (r * 3) + c) * *((int*)input + idxIntmp + c);
      }
      *((int*)output + y * /*6*/8 + x) = sum;
    }
  }
}

输出结果就和书中的一致了：

// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

// 第二次
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

虽然结果一致，但是此时output[6][6]变量的大小变成了output[8][8]，说明已经溢出了，难道get_global_id(0)和get_global_id(1)都是8，如果真是这样的话，那我之前的理解就全错了，这也太恐怖了吧！

经过尝试我发现：

func	value
get_global_id(0)	0-35
get_global_id(1)	0
get_global_size(0)	36

原来书中代码是以一维数组传入的：

const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight };
const size_t localWorkSize[1] = { 1 };

// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(
  queue,
  kernel,
  1,
  NULL,
  globalWorkSize,
  localWorkSize,
  0,
  NULL,
  NULL);
checkErr(errNum, "clEnqueueNDRangeKernel");

所以，我将我的c++代码也进行了相应的修改：

void convolve()
{
  for (int y = 0; y < 1; ++y) {
    for (int x = 0; x < 36; ++x) {
      int sum = 0;
      for (int r = 0; r < 3; r++) {
        const int idxIntmp = (y + r) * 8 + x;

        for (int c = 0; c < 3; c++)
          sum += *((int*)mask + (r * 3) + c) * *((int*)input + idxIntmp + c);
      }
      *((int*)output + y * 0 + x) = sum;
    }
  }
}

这样结果就一致了：

// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

// 第三次
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33  6 34 48
22 31 39 22  9 37
16 27 43 31  0 38

或者我修改一下原书代码，以二维数组传入：

const size_t globalWorkSize[2] = { outputSignalWidth, outputSignalHeight };
const size_t localWorkSize[2] = { 1, 1 };

// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(
  queue,
  kernel,
  2,
  NULL,
  globalWorkSize,
  localWorkSize,
  0,
  NULL,
  NULL);
checkErr(errNum, "clEnqueueNDRangeKernel");

我的测试代码仍保持最初的模样，即：

void convolve()
{
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      int sum = 0;
      for (int r = 0; r < 3; r++) {
        const int idxIntmp = (y + r) * 8 + x;

        for (int c = 0; c < 3; c++)
          sum += *((int*)mask + (r * 3) + c) * *((int*)input + idxIntmp + c);
      }
      *((int*)output + y * 6 + x) = sum;
    }
  }
}

这样的修改后的结果也一致，说明我的理解是正确的：

// 书中的
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16  9 17 23
16 10  6  0 12 11

// 第一次
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16  9 17 23
16 10  6  0 12 11

现在就算真正理解了get_global_id和get_global_size，最后一个疑问就是为什么以一组数组传入和以二维数组传入得到的结果不一样？