本章节翻译by chenchensmail@163.com 原文:Making Better Use of OpenMP Constructs (intel.com)
使用 nowait
减少同步
如果适当,可以在 target
构造上使用 nowait
子句来减少同步。
默认情况下,在 target
区域的末尾有一个隐式 barrier ,该 barrier 确保遇到 target
构造的主机线程在 target
区域完成之前不能继续。
在 target
构造上添加 nowait
子句可以消除这个隐式 barrier ,因此即使 target
区域尚未完成, 遇到 target
构造的主机线程也可以继续。这允许 target
区域在设备上异步执行, 而无需主机线程空闲等待 target
区域完成。
考虑以下示例,其中在一个 parallel
区域(第 42 行)中计算两个向量 v1 和 v2 的逐元素乘积。 一半的计算在主机上由执行 parallel
区域的线程完成。设备上进行另一半的计算。 主线程启动一个 target
区域来在设备上进行计算。
默认情况下,主线程必须等待 target
区域完成后才能继续并参与主机上的计算(工作共享循环)。
/*
 * This test is taken from OpenMP API 5.0.1 Examples (June 2020)
 * https://www.openmp.org/wp-content/uploads/openmp-examples-5-0-1.pdf
 * (4.13.2 nowait Clause on target Construct)
 */


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N 100000 // N must be even
/* Fill v1[0..n-1] and v2[0..n-1] with values derived from the index. */
void init(int n, float *v1, float *v2) {
  int i;

  for(i=0; i<n; i++){
    v1[i] = i * 0.25;
    v2[i] = i - 1.25;
  }
}
/* Times vxv = v1*v2 (element-wise): first half in a target region, second half on the host. */
int main() {
  int i, n=N;
  float v1[N],v2[N],vxv[N];
  double start,end; // timers

  init(n, v1, v2);

  /* Dummy parallel and target regions, so as not to measure startup
     time. */
  #pragma omp parallel
  {
    #pragma omp master
    #pragma omp target
    {;}
  }

  start=omp_get_wtime();

  #pragma omp parallel
  {
    #pragma omp master
    #pragma omp target teams distribute parallel for \
      map(to: v1[0:n/2]) \
      map(to: v2[0:n/2]) \
      map(from: vxv[0:n/2])
    for(i=0; i<n/2; i++){
      vxv[i] = v1[i]*v2[i];
    }
    /* Master thread will wait for target region to be completed
       before proceeding beyond this point. */

    #pragma omp for
    for(i=n/2; i<n; i++) {
      vxv[i] = v1[i]*v2[i];
    }
    /* Implicit barrier at end of worksharing for. */
  }

  end=omp_get_wtime();

  printf("vxv[0]=%f, vxv[n-1]=%f, time=%lf\n", vxv[0], vxv[n-1], end-start);
  return 0;
}
编译命令:
icpx -fiopenmp -fopenmp-targets=spir64 test_target_no_nowait.cpp
运行命令:
OMP_TARGET_OFFLOAD=MANDATORY ZE_AFFINITY_MASK=0.0 LIBOMPTARGET_DEBUG=1 ./a.out
如果在 target
构造上指定了 nowait
子句,那么性能可能会得到提高, 因为这样主线程就不必等待 target
区域完成,可以继续处理工作共享循环。 通过工作共享循环末尾的隐式 barrier 保证了 target
区域的完成。
/*
 * This test is taken from OpenMP API 5.0.1 Examples (June 2020)
 * https://www.openmp.org/wp-content/uploads/openmp-examples-5-0-1.pdf
 * (4.13.2 nowait Clause on target Construct)
 */


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N 100000 // N must be even
/* Fill v1[0..n-1] and v2[0..n-1] with values derived from the index. */
void init(int n, float *v1, float *v2) {
  int i;

  for(i=0; i<n; i++){
    v1[i] = i * 0.25;
    v2[i] = i - 1.25;
  }
}
/* Times vxv = v1*v2 (element-wise): first half in a nowait target region, second half on the host. */
int main() {
  int i, n=N;
  float v1[N],v2[N],vxv[N];
  double start,end; // timers

  init(n, v1,v2);

  /* Dummy parallel and target (nowait) regions, so as not to measure
     startup time. */
  #pragma omp parallel
  {
    #pragma omp master
    #pragma omp target nowait
    {;}
  }

  start=omp_get_wtime();

  #pragma omp parallel
  {
    #pragma omp master
    #pragma omp target teams distribute parallel for nowait \
      map(to: v1[0:n/2]) \
      map(to: v2[0:n/2]) \
      map(from: vxv[0:n/2])
    for(i=0; i<n/2; i++){
      vxv[i] = v1[i]*v2[i];
    }

    #pragma omp for
    for(i=n/2; i<n; i++) {
      vxv[i] = v1[i]*v2[i];
    }
    /* Implicit barrier at end of worksharing for. Target region is
       guaranteed to be completed by this point. */
  }

  end=omp_get_wtime();

  printf("vxv[1]=%f, vxv[n-1]=%f, time=%lf\n", vxv[1], vxv[n-1], end-start);
  return 0;
}
对于两个版本,在我们实验室的一台机器上运行时的性能如下:
no nowait 版本 : 0.008220 秒
nowait on target 版本 : 0.002110 秒
Fortran
上面显示的相同的 nowait
示例可以用 Fortran 编写如下。
1 ! 2 ! This test is from OpenMP API 5.0.1 Examples (June 2020) 3 ! https://www.openmp.org/wp-content/uploads/openmp-examples-5-0-1.pdf 4 !(4.13.2 nowait Clause on target Construct) 5 ! 6 7 subroutine init(n, v1, v2) 8 integer :: i, n 9 real :: v1(n), v2(n) 10 11 do i = 1, n 12 v1(i) = i * 0.25 13 v2(i) = i - 1.25 14 end do 15 end subroutine init 16 17 program test_target_nowait 18 use omp_lib 19 use iso_fortran_env 20 implicit none 21 22 integer, parameter :: NUM=100000 ! NUM must be even 23 real :: v1(NUM), v2(NUM), vxv(NUM) 24 integer :: n, i 25 real(kind=REAL64) :: start, end 26 27 n = NUM 28 call init(n, v1, v2) 29 30 ! Dummy parallel and target (nowait) regions, so as not to measure 31 ! startup time. 32 !$omp parallel 33 !$omp master 34 !$omp target nowait 35 !$omp end target 36 !$omp end master 37 !$omp end parallel 38 39 start=omp_get_wtime() 40 41 !$omp parallel 42 43 !$omp master 44 !$omp target teams distribute parallel do nowait & 45 !$omp& map(to: v1(1:n/2)) & 46 !$omp& map(to: v2(1:n/2)) & 47 !$omp& map(from: vxv(1:n/2)) 48 do i = 1, n/2 49 vxv(i) = v1(i)*v2(i) 50 end do 51 !$omp end master 52 53 !$omp do 54 do i = n/2+1, n 55 vxv(i) = v1(i)*v2(i) 56 end do 57 58 !$omp end parallel 59 60 end=omp_get_wtime() 61 62 write(*,110) "vxv(1)=", vxv(1), ", vxv(n-1)=", vxv(n-1), ", time=", end-start 63110 format (A, F10.6, A, F17.6, A, F10.6) 64 65 end program test_target_nowait