opencl学习(一)

该博客演示了一个使用OpenCL进行数据并行计算的例子,包括获取平台和设备信息、创建上下文、命令队列、缓冲区,加载内核源代码,执行内核以及读取结果。程序涉及一维和二维NDRangeKernel的调用,解释了get_global_id的作用,展示了如何将主机数据传输到设备并执行计算,最后显示了计算结果。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

  • 细分了流程,并注释了相应函数
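  • get_global_id的理解:get_global_id返回的是当前work-item在全局范围内的索引。如果clEnqueueNDRangeKernel设置的是一维且global_item_size = 9,那么各个work-item中get_global_id(0)分别返回0到8,对应9个work-item(本例中local_item_size = 1,每个work-group只包含一个work-item,因此同时也是9个work-group)。如果clEnqueueNDRangeKernel设置的是二维(work_dim = 2)且global_item_size[2] = {9, 8},那么(get_global_id(0), get_global_id(1))的取值是(0,j)(1,j)(2,j)(3,j)(4,j)(5,j)(6,j)(7,j)(8,j),其中j取0到7,共72个work-item,执行顺序不定。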
  • 主机程序,注意kernel.cl的路径
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <CL/cl.h>
#include <time.h>
 
#define MAX_SOURCE_SIZE (0x100000)
//data parallel
int main()
{ 
	int i, j;
	float *A;
	float *B;
	float *C;
	const int row = 9, col = 8;
 
	A = (float *)malloc(row * col * sizeof(float));
	B = (float *)malloc(row * col * sizeof(float));
	C = (float *)malloc(row * col * sizeof(float));
 
	/* Initialize input data */
	printf("Initialize input data");
	for (i = 0; i < row; i++) {
		for (j = 0; j < col; j++) {
			A[i * col + j] = i + j;
			B[i * col + j] = i + j + 1;
		}
	}
	printf("\n");
 
	printf("A array data:\n");
	for (i = 0; i < row; i++) {
		for (int j=0; j<col; j++){
		printf("%.2f\t",A[i*col+j]);	
		}
		printf("\n");
	}
 
	printf("B array data:\n");
	for (i = 0; i < row; i++) {
		for (int j=0; j<col; j++){
		printf("%.2f\t",B[i*col+j]);	
		}
		printf("\n");
	}
 
	clock_t start, finish;
	double  duration;
	printf("DataParallel kernels start to execute\n");
	start = clock();
    
    cl_int ret;
	/* 1.Get Platform Information */
	cl_platform_id platform_id = NULL;
	cl_uint ret_num_platforms;
	// 查询的最大数量,返回的平台列表,实际平台数
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

	/* 2.Device Information */
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	// 平台列表,查询的设备类型,查询的最大数量,返回的设备列表,实际设备数
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

	/* 3.Create OpenCL Context */
	cl_context context = NULL;
	// properties,设备数,设备列表,pfn_notify,user_data,返回码
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	/* 4.Create command queue */
	cl_command_queue command_queue = NULL;
	// context,device_id,properties,返回码
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

	/* 5.Create Buffer Object */
	cl_mem Amobj = NULL;
	cl_mem Bmobj = NULL;
	cl_mem Cmobj = NULL;
	// context,flag,大小,host指针,返回码
	Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
	Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
	Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);

	/* 6.Copy input data to the memory buffer */
	// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
	ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, row * col * sizeof(float), A, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, row * col * sizeof(float), B, 0, NULL, NULL);

	/* 7.Load kernel source file */
	FILE *fp;
	const char fileName[] = "../src/kernel.cl";
	size_t source_size;
	char *source_str;
	fp = fopen(fileName, "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.cl");
		exit(1);
	}
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	/* 8.Create kernel program from source file*/
	cl_program program = NULL;
	// context,count,kernel代码,代码长度,返回码
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

	/*  9. build program*/
	// program,设备数,device_id,options,pfn_notify,user_data
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

	/* 10.Create data parallel OpenCL kernel */
	cl_kernel kernel = NULL;
	// program, 名字, 返回码
	kernel = clCreateKernel(program, "dataParallel", &ret);

	/* 11.Set OpenCL kernel arguments */
	// kernel, 参数索引,参数大小,参数
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);
	
	/* 12.Execute OpenCL kernel as data parallel */
	size_t global_item_size = 9;
	size_t local_item_size = 1;
	// command_queue,kernel,数据维度,global_work_offset,global_item_size,local_item_size,等待事件数,等待事件列表,事件命令
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
	// 2D clEnqueueNDRangeKernel
	// size_t global_item_size[2] = {9, 8};
	// ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_item_size, NULL, 0, NULL, NULL);

	/* 13.wait for the commands to complete before reading back results */
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
 
	/* 14.Transfer result to host */
	// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
	ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, row * col * sizeof(float), C, 0, NULL, NULL);
	// cost
	finish = clock();
	duration = (double)(finish - start) / CLOCKS_PER_SEC;
	printf("cost : %f seconds\n", duration);
	/* Display Results */
	printf("Calculation result:\n");
	for (i = 0; i < row; i++) {
		for (j = 0; j < col; j++) {
			printf("%7.2f\t", C[i * col + j]);
		}
		printf("\n");
	}
 
 
	/* 15.Finalization */
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(Amobj);
	ret = clReleaseMemObject(Bmobj);
	ret = clReleaseMemObject(Cmobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);
 
	free(source_str);
 
	free(A);
	free(B);
	free(C);
	return 0;
 }
  • kernel.cl
/*
 * dataParallel: each work-item handles one 8-element row of the row-major
 * matrices A, B and C, selected by get_global_id(0).
 *
 * Per row it stores:
 *   column 0: A + B      column 1: A - B
 *   column 2: A * B      column 3: A / B
 *   column 4: A          column 5: B
 *   columns 6-7: 0, so the whole row of C has defined contents.
 *
 * A, B: input buffers; C: output buffer. Each must hold at least
 * 8 * (global size) floats.
 */
__kernel void dataParallel(__global float* A, __global float* B, __global float* C)
{
	/* Row stride in elements; has to match `col` in the host program. */
	const int cols = 8;
	/* Offset of the first element of this work-item's row. */
	int i = cols * (int)get_global_id(0);
	/* Prints the row offset so the mapping from work-item to data is visible. */
	printf("%d\n", i);
	C[i+0] = A[i+0] + B[i+0];
	C[i+1] = A[i+1] - B[i+1];
	C[i+2] = A[i+2] * B[i+2];
	C[i+3] = A[i+3] / B[i+3];
	C[i+4] = A[i+4];
	C[i+5] = B[i+5];
	/* The remaining columns carry no computation; write zeros so the host
	 * never reads back unwritten device memory. */
	C[i+6] = 0.0f;
	C[i+7] = 0.0f;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

刀么克瑟拉莫

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值