oneAPI GPU 优化指南 - 访问buffer的不同模式

SYCL中的BufferAccessor模式：优化内存访问和并发执行

翻译已于 2024-02-19 23:28:35 修改 · 159 阅读

文章标签：

#oneapi

于 2024-02-19 23:19:48 首次发布

oneAPI GPU 优化指南专栏收录该内容

46 篇文章

订阅专栏

本文介绍了SYCL中buffer和accessor的概念，重点讲解了访问模式（如read_only和read_write）的作用，以及如何根据kernel操作选择合适的模式以提高性能，避免不必要的内存复制。

本章节翻译by weavingtime@formail.com 原文 Buffer Accessor Modes (intel.com)

在 SYCL 中，buffer提供了主机或设备可以访问的内存的抽象视图。但不能通过buffer对象直接访问buffer。相反，我们必须创建一个允许我们访问buffer数据的accessor对象。

访问模式描述了我们打算如何在程序中使用与accessor关联的内存。 runtime使用accessor的访问模式为kernel创建执行顺序，并执行数据移动。这将确保kernel按照程序员预期的顺序执行。根据底层硬件的功能，如果依赖项不会引起冲突或竞争条件，则runtime可以并发执行kernel。

为了获得更好的性能，请确保accessor的访问模式反映kernel执行的操作。当对声明为 read_only 的accessor进行写入时，编译器将标记错误。但是，如果kernel中没有进行写入，则编译器不会将accessor形式 read_write 的声明更改为读取。

以下示例显示了三个kernel。第一个kernel初始化 A、B 和 C buffer，因此我们指定这些buffer的访问模式为 write_only 。第二个kernel读取A和B buffer，并读写 Cbuffer，因此我们指定A和B buffer的访问模式为 read_only ，C buffer的访问模式为 read_write 。

read_only 访问模式通知runtime，在kernel开始执行之前，数据需要在设备上可用，但在计算结束时不需要将数据从设备复制到主机。

如果第二个kernel对 A 和 B 使用 read_write 而不是 read_only ，则与 A 和 B 关联的内存会在kernel执行结束时从设备复制到主机，即使数据尚未被设备修改。此外， read_write 会创建不必要的依赖关系。如果在此程序块内提交读取 A 或 B buffer的另一个kernel，则在第二个kernel完成之前，该新kernel无法启动。

 1#include <CL/sycl.hpp>
 2#include <stdio.h>
 3
 4constexpr int N = 100;
 5
 6int main() {
 7
 8  int AData[N];
 9  int BData[N];
10  int CData[N];
11
12  sycl::queue Q;
13
14  // Kernel1
15  {
16    // Create 3 buffers, each holding N integers
17    sycl::buffer<int> ABuf(&AData[0], N);
18    sycl::buffer<int> BBuf(&BData[0], N);
19    sycl::buffer<int> CBuf(&CData[0], N);
20
21    Q.submit([&](auto &h) {
22      // Create device accessors.
23      // The property no_init lets the runtime know that the
24      // previous contents of the buffer can be discarded.
25      sycl::accessor aA(ABuf, h, sycl::write_only, sycl::no_init);
26      sycl::accessor aB(BBuf, h, sycl::write_only, sycl::no_init);
27      sycl::accessor aC(CBuf, h, sycl::write_only, sycl::no_init);
28
29      h.parallel_for(N, [=](auto i) {
30        aA[i] = 11;
31        aB[i] = 22;
32        aC[i] = 0;
33      });
34    });
35  } // end Kernel1
36
37  // Kernel2
38  {
39    // Create 3 buffers, each holding N integers
40    sycl::buffer<int> ABuf(&AData[0], N);
41    sycl::buffer<int> BBuf(&BData[0], N);
42    sycl::buffer<int> CBuf(&CData[0], N);
43
44    Q.submit([&](auto &h) {
45      // Create device accessors
46      sycl::accessor aA(ABuf, h, sycl::read_only);
47      sycl::accessor aB(BBuf, h, sycl::read_only);
48      sycl::accessor aC(CBuf, h);
49      h.parallel_for(N, [=](auto i) { aC[i] += aA[i] + aB[i]; });
50    });
51  } // end Kernel2
52
53  // Buffers are destroyed and so CData is updated and can be accessed
54  for (int i = 0; i < N; i++) {
55    printf("%d\n", CData[i]);
56  }
57
58  return 0;
59}

当在 for 循环内重复启动kernel时，指定 read_ony 访问器模式而不是 read_write 特别有用。如果访问模式是 read_write ，则kernel的启动将被串行执行，因为一个kernel应该完成其计算并且数据应该在启动下一个kernel之前准备好。另一方面，如果访问模式是 read_only ，那么运行时可以并行启动kernel。

请注意，buffer声明和kernel启动是同一个有效域内的。这将导致buffer在第一个kernel完成结束时超出范围。这将触发从设备到主机的内容复制。第二个kernel位于另一个有效域内，其中新buffer被声明到同一内存，这将再次触发同一内存从主机到设备的副本。主机和设备之间的这种来回拷贝可以通过声明一次buffer来避免，并确保它们在其指向的内存的生命周期内。下面展示了避免这些不必要的内存传输的代码。

 1#include <CL/sycl.hpp>
 2#include <stdio.h>
 3
 4constexpr int N = 100;
 5
 6int main() {
 7
 8  int AData[N];
 9  int BData[N];
10  int CData[N];
11
12  sycl::queue Q;
13
14  // Create 3 buffers, each holding N integers
15  sycl::buffer<int> ABuf(&AData[0], N);
16  sycl::buffer<int> BBuf(&BData[0], N);
17  sycl::buffer<int> CBuf(&CData[0], N);
18
19  // Kernel1
20  Q.submit([&](auto &h) {
21    // Create device accessors.
22    // The property no_init lets the runtime know that the
23    // previous contents of the buffer can be discarded.
24    sycl::accessor aA(ABuf, h, sycl::write_only, sycl::no_init);
25    sycl::accessor aB(BBuf, h, sycl::write_only, sycl::no_init);
26    sycl::accessor aC(CBuf, h, sycl::write_only, sycl::no_init);
27
28    h.parallel_for(N, [=](auto i) {
29      aA[i] = 11;
30      aB[i] = 22;
31      aC[i] = 0;
32    });
33  });
34
35  // Kernel2
36  Q.submit([&](auto &h) {
37    // Create device sycl::accessors
38    sycl::accessor aA(ABuf, h, sycl::read_only);
39    sycl::accessor aB(BBuf, h, sycl::read_only);
40    sycl::accessor aC(CBuf, h);
41    h.parallel_for(N, [=](auto i) { aC[i] += aA[i] + aB[i]; });
42  });
43
44  // The host accessor creation will ensure that a wait for kernel to finish
45  // is triggered and data from device to host is copied
46  sycl::host_accessor h_acc(CBuf);
47  for (int i = 0; i < N; i++) {
48    printf("%d\n", h_acc[i]);
49  }
50
51  return 0;
52}

以下示例显示了使用不同范围阻塞运行相同代码的另一种方法。在这种情况下，在 kernel1 末尾处不会有从主机到设备的buffer拷贝，在 kernel2 开头处不会有从主机到设备的buffer拷贝。当这些buffer的生命周期结束时，所有三个buffer的拷贝都会在 kernel2 末尾发生。

 1#include <CL/sycl.hpp>
 2#include <stdio.h>
 3
 4constexpr int N = 100;
 5
 6int main() {
 7
 8  int AData[N];
 9  int BData[N];
10  int CData[N];
11
12  sycl::queue Q;
13
14  {
15    // Create 3 buffers, each holding N integers
16    sycl::buffer<int> ABuf(&AData[0], N);
17    sycl::buffer<int> BBuf(&BData[0], N);
18    sycl::buffer<int> CBuf(&CData[0], N);
19
20    // Kernel1
21    Q.submit([&](auto &h) {
22      // Create device accessors.
23      // The property no_init lets the runtime know that the
24      // previous contents of the buffer can be discarded.
25      sycl::accessor aA(ABuf, h, sycl::write_only, sycl::no_init);
26      sycl::accessor aB(BBuf, h, sycl::write_only, sycl::no_init);
27      sycl::accessor aC(CBuf, h, sycl::write_only, sycl::no_init);
28
29      h.parallel_for(N, [=](auto i) {
30        aA[i] = 11;
31        aB[i] = 22;
32        aC[i] = 0;
33      });
34    });
35
36    // Kernel2
37    Q.submit([&](auto &h) {
38      // Create device accessors
39      sycl::accessor aA(ABuf, h, sycl::read_only);
40      sycl::accessor aB(BBuf, h, sycl::read_only);
41      sycl::accessor aC(CBuf, h);
42      h.parallel_for(N, [=](auto i) { aC[i] += aA[i] + aB[i]; });
43    });
44  }
45  // Since the buffers are going out of scope, they will have to be
46  // copied back from device to host and this will require a wait for
47  // all the kernels to finish and so no explicit wait is needed
48  for (int i = 0; i < N; i++) {
49    printf("%d\n", CData[i]);
50  }
51
52  return 0;
53}

还有另一种编写kernel的方法，在设备上访问主机上的只读变量的副本，将其作为定义kernel的 lambda 函数中变量捕获的一部分，如下所示。问题在于，对于每个kernel调用，与向量 AData 和 BData 关联的数据都必须复制到设备。

 1#include <CL/sycl.hpp>
 2#include <stdio.h>
 3
 4constexpr int N = 100;
 5constexpr int iters = 100;
 6
 7int main() {
 8
 9  int AData[N];
10  int BData[N];
11  int CData[N];
12
13  sycl::queue Q;
14  sycl::buffer<int> CBuf(&CData[0], N);
15
16  {
17    // Create 2 buffers, each holding N integers
18    sycl::buffer<int> ABuf(&AData[0], N);
19    sycl::buffer<int> BBuf(&BData[0], N);
20
21    // Kernel1
22    Q.submit([&](auto &h) {
23      // Create device accessors.
24      // The property no_init lets the runtime know that the
25      // previous contents of the buffer can be discarded.
26      sycl::accessor aA(ABuf, h, sycl::write_only, sycl::no_init);
27      sycl::accessor aB(BBuf, h, sycl::write_only, sycl::no_init);
28      sycl::accessor aC(CBuf, h, sycl::write_only, sycl::no_init);
29
30      h.parallel_for(N, [=](auto i) {
31        aA[i] = 11;
32        aB[i] = 22;
33        aC[i] = 0;
34      });
35    });
36  }
37
38  for (int it = 0; it < iters; it++) {
39    // Kernel2
40    Q.submit([&](auto &h) {
41      // Create device accessors
42      sycl::accessor aC(CBuf, h);
43      h.parallel_for(N, [=](auto i) { aC[i] += AData[i] + BData[i]; });
44    });
45  }
46
47  sycl::host_accessor h_acc(CBuf);
48  for (int i = 0; i < N; i++) {
49    printf("%d\n", h_acc[i]);
50  }
51
52  return 0;
53}

最好使用buffer和该buffer的只读访问器，以便向量仅从主机复制到设备一次。在下面的kernel中，对内存 AData 和 BData 的访问是通过第38和39行的 ABuf 和 Bbuf 进行的，第44和45行中的声明使它们成为只读的，这可以防止它们在生命周期结束时从设备复制回主机。

 1#include <CL/sycl.hpp>
 2#include <stdio.h>
 3
 4constexpr int N = 100;
 5constexpr int iters = 100;
 6
 7int main() {
 8
 9  int AData[N];
10  int BData[N];
11  int CData[N];
12
13  sycl::queue Q;
14  sycl::buffer<int> CBuf(&CData[0], N);
15
16  {
17    // Create 2 buffers, each holding N integers
18    sycl::buffer<int> ABuf(&AData[0], N);
19    sycl::buffer<int> BBuf(&BData[0], N);
20
21    // Kernel1
22    Q.submit([&](auto &h) {
23      // Create device accessors.
24      // The property no_init lets the runtime know that the
25      // previous contents of the buffer can be discarded.
26      sycl::accessor aA(ABuf, h, sycl::write_only, sycl::no_init);
27      sycl::accessor aB(BBuf, h, sycl::write_only, sycl::no_init);
28      sycl::accessor aC(CBuf, h, sycl::write_only, sycl::no_init);
29
30      h.parallel_for(N, [=](auto i) {
31        aA[i] = 11;
32        aB[i] = 22;
33        aC[i] = 0;
34      });
35    });
36  }
37
38  sycl::buffer<int> ABuf(&AData[0], N);
39  sycl::buffer<int> BBuf(&BData[0], N);
40  for (int it = 0; it < iters; it++) {
41    // Kernel2
42    Q.submit([&](auto &h) {
43      // Create device accessors
44      sycl::accessor aA(ABuf, h, sycl::read_only);
45      sycl::accessor aB(BBuf, h, sycl::read_only);
46      sycl::accessor aC(CBuf, h);
47      h.parallel_for(N, [=](auto i) { aC[i] += aA[i] + aB[i]; });
48    });
49  }
50
51  sycl::host_accessor h_acc(CBuf);
52  for (int i = 0; i < N; i++) {
53    printf("%d\n", h_acc[i]);
54  }
55
56  return 0;
57}