我的测试环境:
VS2022,.Net5.0
首先到Git上下载NOpenCL源码(GitHub - tunnelvisionlabs/NOpenCL: .NET wrapper for OpenCL with abstraction)
然后在此项目上创建一个控制台,如图:
添加项目引用NOpenCL,并且在主类中添加using NOpenCL;
控制台的主文件源码如下:
internal class MainProgram
{
private const int NumElements = 999_999_99;
static unsafe void Main(string[] args)
{
Console.WriteLine("Hello World!");
int localWorkSize = 256;
int globalWorkSize = RoundUp(localWorkSize, NumElements);
//获取平台数量
var platforms= Platform.GetPlatforms();
if(platforms==null || platforms.Length==0)
{
Console.WriteLine($"{nameof(platforms)}无值");
return;
}
var devs = new List<Device>();
//枚举所有平台下面的设备(CPU或GPU)
devs.AddRange(platforms.SelectMany(t => t.GetDevices(DeviceType.Gpu)));
if (devs == null || devs.Count == 0)
{
Console.WriteLine($"{nameof(devs)}无值");
return;
}
//选中运算设备
var oclDevice = devs[0];
//根据配置建立上下文,上下文用来描述CPU与运算设备之间的主从关系.
var oclContext =Context.Create(oclDevice);
//创建命令队列
var oclCQ = oclContext.CreateCommandQueue(oclDevice, CommandQueueProperties.ProfilingEnable);
//定义一个字典用来存放所有核
var Kernels = new Dictionary<string, Kernel>();
#region 编译代码并导出核
var oclProgram = oclContext.CreateProgramWithSource(CLCode);
try
{
oclProgram.Build();
}
catch (Exception e)
{
Console.WriteLine(e.Message);
throw e;
//return null;
}
foreach (var item in new[] { "vector_add_gpu", "vector_inc_gpu" })
{
Kernels.Add(item, oclProgram.CreateKernel(item));
}
oclProgram.Dispose();
#endregion
Stopwatch sw = Stopwatch.StartNew();
#region 调用vector_add_gpu核
{
var A = new int[globalWorkSize];
var B = new int[globalWorkSize];
var C = new int[globalWorkSize];
var D = new int[globalWorkSize];
FillArray(A, NumElements);
FillArray(B, NumElements);
sw.Start();
for (int i = 0; i < A.Length; i++)
{
D[i] = A[i] + B[i];
}
sw.Stop();
Console.WriteLine($"普通代码用时:{sw.ElapsedMilliseconds}ms");
sw = Stopwatch.StartNew();
sw.Start();
//在显存创建缓冲区并把HOST的数据拷贝过去
var n1 = oclContext.CreateBuffer(MemoryFlags.ReadWrite | MemoryFlags.CopyHostPointer, A.Length * sizeof(int), A.ToIntPtr());
var n2 = oclContext.CreateBuffer(MemoryFlags.ReadWrite | MemoryFlags.CopyHostPointer, B.Length * sizeof(int), B.ToIntPtr());
//还有一个缓冲区用来接收回参
var n3 = oclContext.CreateBuffer(MemoryFlags.ReadWrite, B.Length * sizeof(int), IntPtr.Zero);
//把参数填进Kernel里
using (Kernel kernel = Kernels["vector_add_gpu"])
{
kernel.Arguments[0].SetValue(n1);
kernel.Arguments[1].SetValue(n2);
kernel.Arguments[2].SetValue(n3);
// Start core sequence... copy input data to GPU, compute, copy results back
fixed (int* psrcA = A, psrcB = B, pdst = C)
{
// asynchronous write of data to GPU device
//using (oclCQ.EnqueueWriteBuffer(n1, false, 0, sizeof(float) * globalWorkSize, (IntPtr)psrcA))
//using (oclCQ.EnqueueWriteBuffer(n2, false, 0, sizeof(float) * globalWorkSize, (IntPtr)psrcB))
//{
//}
using (oclCQ.EnqueueNDRangeKernel(kernel, (IntPtr)globalWorkSize, (IntPtr)localWorkSize))
{
}
//把调用请求添加到队列里,参数分别是:Kernel,数据的维度,每个维度的全局工作项ID偏移,每个维度工作项数量(我们这里有4个数据,所以设为4),每个维度的工作组长度(这里设为每4个一组)
//oclCQ.EnqueueNDRangeKernel(Kernels["vector_add_gpu"], 1, new[] { 0 }, new[] { 4 }, new[] { 4 });
//设置栅栏强制要求上面的命令执行完才继续下面的命令.
oclCQ.EnqueueBarrier();
//添加一个读取数据命令到队列里,用来读取运算结果
oclCQ.EnqueueReadBuffer(n3, true, 0, C.Length * sizeof(int), C.ToIntPtr());
//开始执行
oclCQ.Finish();
}
}
sw.Stop();
n1.Dispose();
n2.Dispose();
n3.Dispose();
//Console.WriteLine(String.Join(',', C));
Console.WriteLine($"GPU用时:{sw.ElapsedMilliseconds}ms");
Console.WriteLine($"数组长度:{A.Length}");
}
// */
#endregion
//按顺序释放之前构造的对象
oclCQ.Dispose();
oclContext.Dispose();
oclDevice.Dispose();
}
#region OpenCL代码
private static string CLCode = @"
__kernel void vector_add_gpu(__global int* src_a, __global int* src_b, __global int* res)
{
const int idx = get_global_id(0);
res[idx] =src_a[idx] + src_b[idx];
}
__kernel void vector_inc_gpu(__global int* src_a, __global int* res)
{
const int idx = get_global_id(0);
res[idx] =src_a[idx] + 1;
}
";
#endregion
private static void FillArray(int[] data, int length)
{
Random random = new Random();
for (int i = 0; i < length; i++)
data[i] = random.Next(0,255);
}
private static int RoundUp(int groupSize, int globalSize)
{
int r = globalSize % groupSize;
if (r == 0)
return globalSize;
return globalSize + groupSize - r;
}
}
测试结果:
其中OpenExtend扩展方法:
/// <summary>
/// 取指针
/// </summary>
/// <param name="obj"></param>
/// <returns></returns>
public static unsafe IntPtr ToIntPtr(this int[] obj)
{
//IntPtr PtrA = IntPtr.Zero;
fixed (int* Ap = obj)
return new IntPtr(Ap);
}