忙了好久,期中考试考完了,要开始研究 OpenCL 了,这是自己的第一个 hello world。就是数组加法的并行化,当然这里的 kernel 函数没有单独写在文件里,主要是程序不大,姑且就这样吧,便于阅读。以下是源代码:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
/*
 * OpenCL C source of the "vecadd" kernel, passed to
 * clCreateProgramWithSource and compiled at run time.
 *
 * Each work-item handles one element: it reads its index with
 * get_global_id(0) and stores A[id] + B[id] into C[id].
 *
 * The element type is int because the host code in main() allocates the
 * arrays with sizeof(int) per element, fills them with integer values and
 * verifies the result with an integer comparison. Declaring the buffers as
 * float here would make the device add the int bit patterns as floats and
 * the host-side check would fail.
 */
const char* programSource=
"__kernel \n"
"void vecadd(__global const int* A, \n"
"__global const int* B, \n"
" __global int* C) \n"
"{ \n"
" int id = get_global_id(0); \n"
" C[id] = A[id] + B[id]; \n"
"} \n"
;
int main()
{
int *A = NULL; // 输入数组
int *B = NULL; // 输入数组
int *C = NULL; // 输出数组
// 数组的大小
const int elements = 2048;
// 计算内存大小
size_t datasize = sizeof(int)*elements;
// 分配内存空间
A = (int*)malloc(datasize);
B = (int*)malloc(datasize);
C = (int*)malloc(datasize);
// 初始化输入数组
for(int i = 0;i < elements;i++)
{
A[i] = std::rand();
B[i] = std::rand();
}
// 获取并初始化平台
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
status = clGetPlatformIDs(0,NULL,&numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
//
status = clGetPlatformIDs(numPlatforms,platforms,NULL);
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL,0,NULL,&numDevices);
// 分配内存空间
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL,numDevices,devices,NULL);
cl_context context = NULL;
//创建上下文,管理设备之间的资料
context = clCreateContext(NULL,
numDevices,
devices,
NULL,
NULL,
&status);
cl_command_queue cmdQueue;
//创建命令队列
cmdQueue = clCreateCommandQueue(context,
devices[0],
0,
&status);
//初始化数组内存
cl_mem bufferA;
cl_mem bufferB;
cl_mem bufferC;
bufferA = clCreateBuffer(
context,
CL_MEM_READ_ONLY,
datasize,
NULL,
&status);
bufferB = clCreateBuffer(
context,
CL_MEM_READ_ONLY,
datasize,
NULL,
&status);
bufferC = clCreateBuffer(
context,
CL_MEM_WRITE_ONLY,
datasize,
NULL,
&status);
//将主机端的数据写入设备
status = clEnqueueWriteBuffer(
cmdQueue,
bufferA,
CL_FALSE,
0,
datasize,
A,
0,
NULL,
NULL);
status = clEnqueueWriteBuffer(
cmdQueue,
bufferB,
CL_FALSE,
0,
datasize,
B,
0,
NULL,
NULL);
//编译函数
cl_program program = clCreateProgramWithSource(
context,
1,
(const char**)&programSource,
NULL,
&status);
status = clBuildProgram(
program,
numDevices,
devices,
NULL,
NULL,
NULL);
//创建Kernel函数
cl_kernel kernel = NULL;
kernel = clCreateKernel(program,"vecadd",&status);
//设置参数
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);
//初始化线程的映射
size_t globalWorkSize[1];
globalWorkSize[0] = elements;
//运行kernel
status = clEnqueueNDRangeKernel(
cmdQueue,
kernel,
1,
NULL,
globalWorkSize,
NULL,
0,
NULL,
NULL);
//从设备中读回数据结果
clEnqueueReadBuffer(
cmdQueue,
bufferC,
CL_TRUE,
0,
datasize,
C,
0,
NULL,
NULL);
bool result = true;
for(int i = 0;i < elements;i++)
{
//std::cout<<C[i]<<std::endl;
if(C[i]!=A[i]+B[i])
{
result = false;
//break;
}
}
if(result)
{
printf("Output is correct\n");
}
else
{
printf("Output is incorrect\n");
}
//清理数据
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseContext(context);
free(A);
free(B);
free(C);
free(platforms);
free(devices);
return 0;
}