- 细分了流程,并注释了相应函数
- 对get_global_id的理解:如果clEnqueueNDRangeKernel设置的是一维且global_item_size = 9,那么get_global_id(0)返回0到8,对应9个work-item(本例中local_item_size = 1,所以每个work-item各自构成一个work-group)。如果clEnqueueNDRangeKernel设置的是二维且global_item_size[2] = {9, 8},那么(get_global_id(0), get_global_id(1))的取值是(0,j)(1,j)(2,j)(3,j)(4,j)(5,j)(6,j)(7,j)(8,j),其中j取0到7,各work-item的执行顺序不定。
- 主机程序,注意 `kernel.cl` 的路径
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <CL/cl.h>
#include <time.h>
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
int i, j;
float *A;
float *B;
float *C;
const int row = 9, col = 8;
A = (float *)malloc(row * col * sizeof(float));
B = (float *)malloc(row * col * sizeof(float));
C = (float *)malloc(row * col * sizeof(float));
printf("Initialize input data");
for (i = 0; i < row; i++) {
for (j = 0; j < col; j++) {
A[i * col + j] = i + j;
B[i * col + j] = i + j + 1;
}
}
printf("\n");
printf("A array data:\n");
for (i = 0; i < row; i++) {
for (int j=0; j<col; j++){
printf("%.2f\t",A[i*col+j]);
}
printf("\n");
}
printf("B array data:\n");
for (i = 0; i < row; i++) {
for (int j=0; j<col; j++){
printf("%.2f\t",B[i*col+j]);
}
printf("\n");
}
clock_t start, finish;
double duration;
printf("DataParallel kernels start to execute\n");
start = clock();
cl_int ret;
cl_platform_id platform_id = NULL;
cl_uint ret_num_platforms;
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
cl_context context = NULL;
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
cl_command_queue command_queue = NULL;
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, row * col * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, row * col * sizeof(float), B, 0, NULL, NULL);
FILE *fp;
const char fileName[] = "../src/kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.cl");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
cl_program program = NULL;
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "dataParallel", &ret);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);
size_t global_item_size = 9;
size_t local_item_size = 1;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, row * col * sizeof(float), C, 0, NULL, NULL);
finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
printf("cost : %f seconds\n", duration);
printf("Calculation result:\n");
for (i = 0; i < row; i++) {
for (j = 0; j < col; j++) {
printf("%7.2f\t", C[i * col + j]);
}
printf("\n");
}
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str);
free(A);
free(B);
free(C);
return 0;
}
/*
 * dataParallel: each work-item handles the chunk of A, B and C that starts
 * at element 8 * get_global_id(0), and prints that start offset.
 *
 * Within its chunk the work-item stores into C:
 *   [0] A + B    [1] A - B    [2] A * B    [3] A / B
 *   [4] copy of A             [5] copy of B
 * Elements [6] and [7] of the chunk are not written by this kernel.
 */
__kernel void dataParallel(__global float* A, __global float* B, __global float* C)
{
    int base = 8*get_global_id(0);
    printf("%d\n", base);

    /* Views of the three buffers positioned at this work-item's chunk. */
    __global float *lhs = A + base;
    __global float *rhs = B + base;
    __global float *out = C + base;

    out[0] = lhs[0] + rhs[0];
    out[1] = lhs[1] - rhs[1];
    out[2] = lhs[2] * rhs[2];
    out[3] = lhs[3] / rhs[3];
    out[4] = lhs[4];
    out[5] = rhs[5];
}