本章节翻译by chenchensmail@163.com 原文:Fortran Example (intel.com)
我们将使用一个 Fortran 示例来说明如何适当地选择内存分配,以避免冗余的数据传输并提高性能。
这个 Fortran 示例使用了 OpenMP 部署,并且是根据 NWChem (一个高性能的计算化学应用程序)进行改编的。
在这个示例中,有一个 4 层的循环嵌套。最内层的 k 循环调用了 omp_fbody 例程,该例程将两个对 sgemm 的调用部署到设备上, 然后将一个 reduction 计算( reduction 变量 emp4i 和 emp5i )部署到设备上。
在循环嵌套中,有一些数组在主机上进行了更新(即,数组 Tkj 、 Kkj、 Jia 、 Kia、 Tia、 Xia 和 t1v1)。因此,我们需要使设备上这些数组的值与主机上的值保持一致。
版本 1
在程序的第一个(初级)版本中,我们使用 allocate 指令在共享内存中分配所有 11 个数组。
!$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( eorb(1:nbf) ) ...
共享分配可以由主机和设备访问,并根据需要自动在主机和设备之间迁移。因此,可以在主机和设备上使用指向共享分配的相同指针。
版本 1 如下所示。
1 include "mkl_omp_offload.f90"
2
3 subroutine omp_fbody(f1n,f2n,eorb, &
4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
5 Jia, Kia, Tia, Xia, Tkj, Kkj, &
6 t1v1,t1v2)
7
8 use omp_lib
9 use onemkl_blas_omp_offload_lp64
10 use iso_fortran_env
11 implicit none
12
13 real, intent(inout) :: emp4,emp5
14 integer, intent(in) :: ncor,nocc,nvir
15 integer, intent(in) :: a,i,j,k, klo
16 real, intent(inout) :: f1n(nvir,nvir)
17 real, intent(inout) :: f2n(nvir,nvir)
18 real, intent(in) :: eorb(*)
19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
20 real, intent(in) :: Tkj(*), Kkj(*)
21 real, intent(in) :: t1v1(nvir),t1v2(nvir)
22 real :: emp4i,emp5i
23 real :: eaijk,denom
24 integer :: lnov,lnvv
25 integer :: b,c
26 real :: f1nbc,f1ncb,f2nbc,f2ncb
27 real :: t1v1b,t1v2b
28
29 lnov=nocc*nvir
30 lnvv=nvir*nvir
31 emp4i = 0.0
32 emp5i = 0.0
33
34 !$omp dispatch
35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
37
38 !$omp dispatch
39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
41
42 !$omp dispatch
43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
45
46 !$omp dispatch
47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
49
50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
51
52 !$omp target teams distribute parallel do collapse(2) &
53 !$omp reduction(+:emp4i,emp5i) &
54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
55 !$omp private(t1v1b,t1v2b) &
56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
57 do b=1,nvir
58 do c=1,nvir
59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
60
61 f1nbc = f1n(b,c);
62 f1ncb = f1n(c,b);
63 f2nbc = f2n(b,c);
64 f2ncb = f2n(c,b);
65 t1v1b = t1v1(b);
66 t1v2b = t1v2(b);
67
68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
70 enddo
71 enddo
72 !$omp end target teams distribute parallel do
73
74 emp4 = emp4 + emp4i
75 emp5 = emp5 + emp5i
76
77 end
78
79
80 subroutine init_array_1(arr, m)
81 implicit none
82 real, intent(inout) :: arr(m)
83 integer m, i
84
85 do i = 1, m
86 arr(i) = 1.0/(100.0 + i-1)
87 end do
88 end subroutine init_array_1
89
90
91 subroutine init_array_2(arr, m, n)
92 implicit none
93 real, intent(inout) :: arr(m, n)
94 integer m, n, i, j
95
96 do i = 1, m
97 do j = 1, n
98 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
99 end do
100 end do
101 end subroutine init_array_2
102
103
104 program main
105
106 use omp_lib
107 use iso_fortran_env
108 implicit none
109
110 interface
111 subroutine omp_fbody(f1n,f2n,eorb, &
112 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
113 Jia, Kia, Tia, Xia, Tkj, Kkj, &
114 t1v1,t1v2)
115 real, intent(inout) :: emp4,emp5
116 integer, intent(in) :: ncor,nocc,nvir
117 integer, intent(in) :: a,i,j,k, klo
118 real, intent(inout) :: f1n(nvir,nvir)
119 real, intent(inout) :: f2n(nvir,nvir)
120 real, intent(in) :: eorb(*)
121 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
122 real, intent(in) :: t1v1(nvir),t1v2(nvir)
123 end subroutine omp_fbody
124 end interface
125
126 integer :: ncor, nocc, nvir, maxiter, nkpass
127 integer :: nbf, lnvv, lnov, kchunk
128 real, allocatable :: eorb(:)
129 real, allocatable :: f1n(:,:)
130 real, allocatable :: f2n(:,:)
131
132 real, allocatable :: Jia(:)
133 real, allocatable :: Kia(:)
134 real, allocatable :: Tia(:)
135 real, allocatable :: Xia(:)
136 real, allocatable :: Tkj(:)
137 real, allocatable :: Kkj(:)
138
139 real, allocatable :: t1v1(:),t1v2(:)
140 real emp4, emp5
141 integer :: a, b, c, i, j, k
142 integer :: klo, khi, iter
143 double precision, allocatable :: timers(:)
144 double precision :: t0, t1, tsum, tmax, tmin, tavg
145
146! Run parameters
147 nocc = 256
148 nvir = 2048
149 maxiter = 50
150 nkpass = 1
151 ncor = 0
152
153 print *, "Run parameters:"
154 print *, "nocc =", nocc
155 print *, "nvir =", nvir
156 print *, "maxiter =", maxiter
157 print *, "nkpass =", nkpass
158 print *, "ncor =", ncor
159 print *, " "
160
161! Allocate and initialize arrays.
162
163 nbf = ncor + nocc + nvir
164 lnvv = nvir * nvir
165 lnov = nocc * nvir
166 kchunk = (nocc - 1)/nkpass + 1
167
168 !$omp allocate allocator(omp_target_shared_mem_alloc)
169 allocate( f1n(1:nvir,1:nvir) )
170
171 !$omp allocate allocator(omp_target_shared_mem_alloc)
172 allocate( f2n(1:nvir,1:nvir) )
173
174 !$omp allocate allocator(omp_target_shared_mem_alloc)
175 allocate( eorb(1:nbf) )
176
177 !$omp allocate allocator(omp_target_shared_mem_alloc)
178 allocate( Jia(1:lnvv) )
179
180 !$omp allocate allocator(omp_target_shared_mem_alloc)
181 allocate( Kia(1:lnvv) )
182
183 !$omp allocate allocator(omp_target_shared_mem_alloc)
184 allocate( Tia(1:lnov*nocc) )
185
186 !$omp allocate allocator(omp_target_shared_mem_alloc)
187 allocate( Xia(1:lnov*nocc))
188
189 !$omp allocate allocator(omp_target_shared_mem_alloc)
190 allocate( Tkj(1:kchunk*lnvv) )
191
192 !$omp allocate allocator(omp_target_shared_mem_alloc)
193 allocate( Kkj(1:kchunk*lnvv) )
194
195 !$omp allocate allocator(omp_target_shared_mem_alloc)
196 allocate( t1v1(1:lnvv) )
197
198 !$omp allocate allocator(omp_target_shared_mem_alloc)
199 allocate( t1v2(1:lnvv) )
200!
201 call init_array_1(eorb, nbf)
202 call init_array_1(Jia, lnvv)
203 call init_array_1(Kia, lnvv)
204 call init_array_1(Tia, lnov*nocc)
205 call init_array_1(Xia, lnov*nocc)
206 call init_array_1(Tkj, kchunk*lnvv)
207 call init_array_1(Kkj, kchunk*lnov)
208 call init_array_1(t1v1, lnvv)
209 call init_array_1(t1v2, lnvv)
210
211 call init_array_2(f1n, nvir, nvir)
212 call init_array_2(f2n, nvir, nvir)
213
214 print *, "End of initialization"
215
216 allocate (timers(1:maxiter))
217
218 emp4=0.0
219 emp5=0.0
220 a=1
221 iter=1
222
223 do klo = 1, nocc, kchunk
224 khi = MIN(nocc, klo+kchunk-1)
225 do j = 1, nocc
226
227#if defined(DO_UPDATE_ARRAYS)
228! Update elements of Tkj and KKj.
229 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
230 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
231#endif
232
233 do i = 1, nocc
234
235#if defined(DO_UPDATE_ARRAYS)
236! Update elements of Jia, Kia, Tia, Xia arrays.
237 Jia(lnvv) = Jia(lnvv) + 1.0
238 Kia(lnvv) = Kia(lnvv) + 1.0
239 Tia(lnov) = Tia(lnov) + 1.0
240 Xia(lnov) = Xia(lnov) + 1.0
241#endif
242
243 do k = klo, MIN(khi,i)
244
245#if defined(DO_UPDATE_ARRAYS)
246! Update elements of t1v1 array.
247 t1v1(:) = Tkj(lnvv-nvir+1:lnvv)
248#endif
249
250 t0 = omp_get_wtime()
251
252 call omp_fbody(f1n,f2n,eorb, &
253 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
254 Jia, Kia, Tia, Xia, Tkj, Kkj, &
255 t1v1,t1v2)
256
257 t1 = omp_get_wtime()
258 timers(iter) = (t1-t0)
259 if (iter .eq. maxiter) then
260 print *, "Stopping after ", iter, "iterations"
261 print *, " "
262 goto 150
263 endif
264
265! Prevent NAN for large maxiter...
266 if (emp4 > 1000.0) then
267 emp4 = emp4 - 1000.0
268 endif
269 if (emp4 < -1000.0) then
270 emp4 = emp4 + 1000.0
271 endif
272 if (emp5 > 1000.0) then
273 emp5 = emp5 - 1000.0
274 endif
275 if (emp5 < -1000.0) then
276 emp5 = emp5 + 1000.0
277 endif
278
279 iter = iter + 1
280
281 end do ! k = klo, MIN(khi,i)
282 end do ! do i = 1, nocc
283 end do ! do j = 1, nocc
284 end do ! do klo = 1, nocc, kchunk
285
286 150 CONTINUE
287
288 tsum = 0.0
289 tmax = -1.0e10
290 tmin = 1.0e10
291 do i = 2, iter
292 tsum = tsum + timers(i)
293 tmax = MAX(tmax,timers(i))
294 tmin = MIN(tmin,timers(i))
295 end do
296
297 tavg = tsum / (iter - 1)
298 print *, "TOTAL ITER: ", iter
299 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
300 110 format (A, F9.6, A, F9.6, A, F9.6, A)
301
302 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
303 120 format (A, F15.3, A, F15.3)
304
305 print *, "END"
306
307 deallocate (f1n)
308 deallocate (f2n)
309 deallocate (eorb)
310 deallocate (Jia)
311 deallocate (Kia)
312 deallocate (Tia)
313 deallocate (Xia)
314 deallocate (Tkj)
315 deallocate (Kkj)
316
317 deallocate (t1v1)
318 deallocate (t1v2)
319 deallocate (timers)
320
321 end program
虽然这个版本编程简单直接,但并不是最高效的。
版本 2
在程序的第二个版本中,我们使用普通的 allocate 在系统内存中分配所有 11 个数组,并使用 map 子句将数组映射到设备
real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: eorb(:) ... !$omp target data & map(to: f1n, f2n, eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2)
当在主机上更新映射到设备的数组时,我们使用 OpenMP target update to 指令将数组的新值从主机拷贝到设备。我们尽可能地将指令放在循环嵌套的最高层,以避免不必要地拷贝数组。
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
完整的版本 2 如下所示。
1 include "mkl_omp_offload.f90"
2
3 subroutine omp_fbody(f1n,f2n,eorb, &
4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
5 Jia, Kia, Tia, Xia, Tkj, Kkj, &
6 t1v1,t1v2)
7
8 use omp_lib
9 use onemkl_blas_omp_offload_lp64
10 use iso_fortran_env
11 implicit none
12
13 real, intent(inout) :: emp4,emp5
14 integer, intent(in) :: ncor,nocc,nvir
15 integer, intent(in) :: a,i,j,k, klo
16 real, intent(inout) :: f1n(nvir,nvir)
17 real, intent(inout) :: f2n(nvir,nvir)
18 real, intent(in) :: eorb(*)
19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
20 real, intent(in) :: Tkj(*), Kkj(*)
21 real, intent(in) :: t1v1(nvir),t1v2(nvir)
22 real :: emp4i,emp5i
23 real :: eaijk,denom
24 integer :: lnov,lnvv
25 integer :: b,c
26 real :: f1nbc,f1ncb,f2nbc,f2ncb
27 real :: t1v1b,t1v2b
28
29 lnov=nocc*nvir
30 lnvv=nvir*nvir
31 emp4i = 0.0
32 emp5i = 0.0
33
34 !$omp dispatch
35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
37
38 !$omp dispatch
39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
41
42 !$omp dispatch
43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
45
46 !$omp dispatch
47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
49
50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
51
52 !$omp target teams distribute parallel do collapse(2) &
53 !$omp reduction(+:emp4i,emp5i) &
54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
55 !$omp private(t1v1b,t1v2b) &
56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
57 do b=1,nvir
58 do c=1,nvir
59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
60
61 f1nbc = f1n(b,c);
62 f1ncb = f1n(c,b);
63 f2nbc = f2n(b,c);
64 f2ncb = f2n(c,b);
65 t1v1b = t1v1(b);
66 t1v2b = t1v2(b);
67
68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
70 enddo
71 enddo
72 !$omp end target teams distribute parallel do
73
74 emp4 = emp4 + emp4i
75 emp5 = emp5 + emp5i
76
77 end
78
79
80 subroutine init_array_1(arr, m)
81 implicit none
82 real, intent(inout) :: arr(m)
83 integer m, i
84
85 do i = 1, m
86 arr(i) = 1.0/(100.0 + i-1)
87 end do
88 end subroutine init_array_1
89
90
91 subroutine init_array_2(arr, m, n)
92 implicit none
93 real, intent(inout) :: arr(m, n)
94 integer m, n, i, j
95
96 do i = 1, m
97 do j = 1, n
98 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
99 end do
100 end do
101 end subroutine init_array_2
102
103
104 program main
105
106 use omp_lib
107 use iso_fortran_env
108 implicit none
109
110 interface
111 subroutine omp_fbody(f1n,f2n,eorb, &
112 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
113 Jia, Kia, Tia, Xia, Tkj, Kkj, &
114 t1v1,t1v2)
115 real, intent(inout) :: emp4,emp5
116 integer, intent(in) :: ncor,nocc,nvir
117 integer, intent(in) :: a,i,j,k, klo
118 real, intent(inout) :: f1n(nvir,nvir)
119 real, intent(inout) :: f2n(nvir,nvir)
120 real, intent(in) :: eorb(*)
121 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
122 real, intent(in) :: t1v1(nvir),t1v2(nvir)
123 end subroutine omp_fbody
124 end interface
125
126 integer :: ncor, nocc, nvir, maxiter, nkpass
127 integer :: nbf, lnvv, lnov, kchunk
128 real, allocatable :: eorb(:)
129 real, allocatable :: f1n(:,:)
130 real, allocatable :: f2n(:,:)
131
132 real, allocatable :: Jia(:)
133 real, allocatable :: Kia(:)
134 real, allocatable :: Tia(:)
135 real, allocatable :: Xia(:)
136 real, allocatable :: Tkj(:)
137 real, allocatable :: Kkj(:)
138
139 real, allocatable :: t1v1(:),t1v2(:)
140 real emp4, emp5
141 integer :: a, b, c, i, j, k
142 integer :: klo, khi, iter
143 double precision, allocatable :: timers(:)
144 double precision :: t0, t1, tsum, tmax, tmin, tavg
145
146! Run parameters
147 nocc = 256
148 nvir = 2048
149 maxiter = 50
150 nkpass = 1
151 ncor = 0
152
153 print *, "Run parameters:"
154 print *, "nocc =", nocc
155 print *, "nvir =", nvir
156 print *, "maxiter =", maxiter
157 print *, "nkpass =", nkpass
158 print *, "ncor =", ncor
159 print *, " "
160
161! Allocate and initialize arrays.
162
163 nbf = ncor + nocc + nvir
164 lnvv = nvir * nvir
165 lnov = nocc * nvir
166 kchunk = (nocc - 1)/nkpass + 1
167
168 allocate( f1n(1:nvir,1:nvir) )
169 allocate( f2n(1:nvir,1:nvir) )
170 allocate( eorb(1:nbf) )
171 allocate( Jia(1:lnvv) )
172 allocate( Kia(1:lnvv) )
173 allocate( Tia(1:lnov*nocc) )
174 allocate( Xia(1:lnov*nocc))
175 allocate( Tkj(1:kchunk*lnvv) )
176 allocate( Kkj(1:kchunk*lnvv) )
177 allocate( t1v1(1:lnvv) )
178 allocate( t1v2(1:lnvv) )
179!
180 call init_array_1(eorb, nbf)
181 call init_array_1(Jia, lnvv)
182 call init_array_1(Kia, lnvv)
183 call init_array_1(Tia, lnov*nocc)
184 call init_array_1(Xia, lnov*nocc)
185 call init_array_1(Tkj, kchunk*lnvv)
186 call init_array_1(Kkj, kchunk*lnov)
187 call init_array_1(t1v1, lnvv)
188 call init_array_1(t1v2, lnvv)
189
190 call init_array_2(f1n, nvir, nvir)
191 call init_array_2(f2n, nvir, nvir)
192
193 print *, "End of initialization"
194
195 allocate (timers(1:maxiter))
196
197 emp4=0.0
198 emp5=0.0
199 a=1
200 iter=1
201
202 !$omp target data &
203 map(to: f1n, f2n, eorb) &
204 map(to: Jia, Kia, Tia, Xia) &
205 map(to: Tkj, Kkj) &
206 map(to: t1v1, t1v2)
207
208 do klo = 1, nocc, kchunk
209 khi = MIN(nocc, klo+kchunk-1)
210 do j = 1, nocc
211
212#if defined(DO_UPDATE_ARRAYS)
213! Update elements of Tkj and KKj.
214 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
215 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
216
217 !$omp target update to (Tkj, Kkj)
218#endif
219
220 do i = 1, nocc
221
222#if defined(DO_UPDATE_ARRAYS)
223! Update elements of Jia, Kia, Tia, Xia arrays.
224 Jia(lnvv) = Jia(lnvv) + 1.0
225 Kia(lnvv) = Kia(lnvv) + 1.0
226 Tia(lnov) = Tia(lnov) + 1.0
227 Xia(lnov) = Xia(lnov) + 1.0
228
229 !$omp target update to (Jia, Kia, Tia, Xia)
230#endif
231
232 do k = klo, MIN(khi,i)
233
234#if defined(DO_UPDATE_ARRAYS)
235! Update elements of t1v1 array.
236 t1v1(:) = Tkj(lnvv-nvir+1:lnvv)
237
238 !$omp target update to (t1v1)
239#endif
240
241 t0 = omp_get_wtime()
242
243 call omp_fbody(f1n,f2n,eorb, &
244 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
245 Jia, Kia, Tia, Xia, Tkj, Kkj, &
246 t1v1,t1v2)
247
248 t1 = omp_get_wtime()
249 timers(iter) = (t1-t0)
250 if (iter .eq. maxiter) then
251 print *, "Stopping after ", iter, "iterations"
252 print *, " "
253 goto 150
254 endif
255
256! Prevent NAN for large maxiter...
257 if (emp4 > 1000.0) then
258 emp4 = emp4 - 1000.0
259 endif
260 if (emp4 < -1000.0) then
261 emp4 = emp4 + 1000.0
262 endif
263 if (emp5 > 1000.0) then
264 emp5 = emp5 - 1000.0
265 endif
266 if (emp5 < -1000.0) then
267 emp5 = emp5 + 1000.0
268 endif
269
270 iter = iter + 1
271
272 end do ! k = klo, MIN(khi,i)
273 end do ! do i = 1, nocc
274 end do ! do j = 1, nocc
275 end do ! do klo = 1, nocc, kchunk
276
277 150 CONTINUE
278 !$omp end target data
279
280 tsum = 0.0
281 tmax = -1.0e10
282 tmin = 1.0e10
283 do i = 2, iter
284 tsum = tsum + timers(i)
285 tmax = MAX(tmax,timers(i))
286 tmin = MIN(tmin,timers(i))
287 end do
288
289 tavg = tsum / (iter - 1)
290 print *, "TOTAL ITER: ", iter
291 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
292 110 format (A, F9.6, A, F9.6, A, F9.6, A)
293
294 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
295 120 format (A, F15.3, A, F15.3)
296
297 print *, "END"
298
299 deallocate (f1n)
300 deallocate (f2n)
301 deallocate (eorb)
302 deallocate (Jia)
303 deallocate (Kia)
304 deallocate (Tia)
305 deallocate (Xia)
306 deallocate (Tkj)
307 deallocate (Kkj)
308
309 deallocate (t1v1)
310 deallocate (t1v2)
311 deallocate (timers)
312
313 end program
版本 3
在程序的第三个版本中,我们考虑哪些数组只在主机上使用,哪些只在设备上使用,或者哪些既在主机上也在设备上使用。 我们还考虑数组值是否在程序执行过程中进行了更新。这些信息被用来决定在哪里分配数组,如何初始化它们,以及是否需要更新设备上的值。
数组 f1 和 f2 作为设备上的工作数组使用(用于存储对 SGEMM 的调用结果),并且它们不在主机上访问。所以我们直接在设备上分配它们。
由于 f1 和 f2 在设备上分配,我们调用 init_array_2() 在一个 OpenMP target 结构体中初始化数组。
其他 9 个数组在主机和设备上都被访问,所以我们在主机内存中分配它们,并调用 init_array_1() 初始化数组。然后使用 map 子句将数组映射到设备。
我们使用 OpenMP target update to 指令将 Tkj, Kkj, Jia, Kia, Tia, Xia``和 ``t1v1 的更新值从主机拷贝到设备。
!$omp allocate allocator(omp_target_device_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_device_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( eorb(1:nbf) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Jia(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Kia(1:lnvv) ) ... !$omp target data & map(to: eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2) ... Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
以下是程序的完整版本 3 。
1 include "mkl_omp_offload.f90"
2
3 subroutine omp_fbody(f1n,f2n,eorb, &
4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
5 Jia, Kia, Tia, Xia, Tkj, Kkj, &
6 t1v1,t1v2)
7
8 use omp_lib
9 use onemkl_blas_omp_offload_lp64
10 use iso_fortran_env
11 implicit none
12
13 real, intent(inout) :: emp4,emp5
14 integer, intent(in) :: ncor,nocc,nvir
15 integer, intent(in) :: a,i,j,k, klo
16 real, intent(inout) :: f1n(nvir,nvir)
17 real, intent(inout) :: f2n(nvir,nvir)
18 real, intent(in) :: eorb(*)
19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
20 real, intent(in) :: Tkj(*), Kkj(*)
21 real, intent(in) :: t1v1(nvir),t1v2(nvir)
22 real :: emp4i,emp5i
23 real :: eaijk,denom
24 integer :: lnov,lnvv
25 integer :: b,c
26 real :: f1nbc,f1ncb,f2nbc,f2ncb
27 real :: t1v1b,t1v2b
28
29 lnov=nocc*nvir
30 lnvv=nvir*nvir
31 emp4i = 0.0
32 emp5i = 0.0
33
34 !$omp dispatch
35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
37
38 !$omp dispatch
39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
41
42 !$omp dispatch
43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
45
46 !$omp dispatch
47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
49
50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
51
52 !$omp target teams distribute parallel do collapse(2) &
53 !$omp reduction(+:emp4i,emp5i) &
54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
55 !$omp private(t1v1b,t1v2b) &
56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
57 do b=1,nvir
58 do c=1,nvir
59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
60
61 f1nbc = f1n(b,c);
62 f1ncb = f1n(c,b);
63 f2nbc = f2n(b,c);
64 f2ncb = f2n(c,b);
65 t1v1b = t1v1(b);
66 t1v2b = t1v2(b);
67
68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
70 enddo
71 enddo
72 !$omp end target teams distribute parallel do
73
74 emp4 = emp4 + emp4i
75 emp5 = emp5 + emp5i
76
77 end
78
79
80 subroutine init_array_1(arr, m)
81 implicit none
82 real, intent(inout) :: arr(m)
83 integer m, i
84
85 do i = 1, m
86 arr(i) = 1.0/(100.0 + i-1)
87 end do
88 end subroutine init_array_1
89
90
91 subroutine init_array_2(arr, m, n)
92 implicit none
93 real, intent(inout) :: arr(m, n)
94 integer m, n, i, j
95
96 !$omp target teams distribute parallel do
97 do i = 1, m
98 do j = 1, n
99 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
100 end do
101 end do
102 end subroutine init_array_2
103
104
105 program main
106
107 use omp_lib
108 use iso_fortran_env
109 implicit none
110
111 interface
112 subroutine omp_fbody(f1n,f2n,eorb, &
113 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
114 Jia, Kia, Tia, Xia, Tkj, Kkj, &
115 t1v1,t1v2)
116 real, intent(inout) :: emp4,emp5
117 integer, intent(in) :: ncor,nocc,nvir
118 integer, intent(in) :: a,i,j,k, klo
119 real, intent(inout) :: f1n(nvir,nvir)
120 real, intent(inout) :: f2n(nvir,nvir)
121 real, intent(in) :: eorb(*)
122 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
123 real, intent(in) :: t1v1(nvir),t1v2(nvir)
124 end subroutine omp_fbody
125 end interface
126
127 integer :: ncor, nocc, nvir, maxiter, nkpass
128 integer :: nbf, lnvv, lnov, kchunk
129 real, allocatable :: eorb(:)
130 real, allocatable :: f1n(:,:)
131 real, allocatable :: f2n(:,:)
132
133 real, allocatable :: Jia(:)
134 real, allocatable :: Kia(:)
135 real, allocatable :: Tia(:)
136 real, allocatable :: Xia(:)
137 real, allocatable :: Tkj(:)
138 real, allocatable :: Kkj(:)
139
140 real, allocatable :: t1v1(:),t1v2(:)
141 real emp4, emp5
142 integer :: a, b, c, i, j, k
143 integer :: klo, khi, iter
144 double precision, allocatable :: timers(:)
145 double precision :: t0, t1, tsum, tmax, tmin, tavg
146
147! Run parameters
148 nocc = 256
149 nvir = 2048
150 maxiter = 50
151 nkpass = 1
152 ncor = 0
153
154 print *, "Run parameters:"
155 print *, "nocc =", nocc
156 print *, "nvir =", nvir
157 print *, "maxiter =", maxiter
158 print *, "nkpass =", nkpass
159 print *, "ncor =", ncor
160 print *, " "
161
162! Allocate and initialize arrays.
163
164 nbf = ncor + nocc + nvir
165 lnvv = nvir * nvir
166 lnov = nocc * nvir
167 kchunk = (nocc - 1)/nkpass + 1
168
169 !$omp allocate allocator(omp_target_device_mem_alloc)
170 allocate( f1n(1:nvir,1:nvir) )
171
172 !$omp allocate allocator(omp_target_device_mem_alloc)
173 allocate( f2n(1:nvir,1:nvir) )
174
175 !$omp allocate allocator(omp_target_host_mem_alloc)
176 allocate( eorb(1:nbf) )
177
178 !$omp allocate allocator(omp_target_host_mem_alloc)
179 allocate( Jia(1:lnvv) )
180
181 !$omp allocate allocator(omp_target_host_mem_alloc)
182 allocate( Kia(1:lnvv) )
183
184 !$omp allocate allocator(omp_target_host_mem_alloc)
185 allocate( Tia(1:lnov*nocc) )
186
187 !$omp allocate allocator(omp_target_host_mem_alloc)
188 allocate( Xia(1:lnov*nocc))
189
190 !$omp allocate allocator(omp_target_host_mem_alloc)
191 allocate( Tkj(1:kchunk*lnvv) )
192
193 !$omp allocate allocator(omp_target_host_mem_alloc)
194 allocate( Kkj(1:kchunk*lnvv) )
195
196 !$omp allocate allocator(omp_target_host_mem_alloc)
197 allocate( t1v1(1:lnvv) )
198
199 !$omp allocate allocator(omp_target_host_mem_alloc)
200 allocate( t1v2(1:lnvv) )
201!
202 call init_array_1(eorb, nbf)
203 call init_array_1(Jia, lnvv)
204 call init_array_1(Kia, lnvv)
205 call init_array_1(Tia, lnov*nocc)
206 call init_array_1(Xia, lnov*nocc)
207 call init_array_1(Tkj, kchunk*lnvv)
208 call init_array_1(Kkj, kchunk*lnov)
209 call init_array_1(t1v1, lnvv)
210 call init_array_1(t1v2, lnvv)
211
212 call init_array_2(f1n, nvir, nvir)
213 call init_array_2(f2n, nvir, nvir)
214
215 print *, "End of initialization"
216
217 allocate (timers(1:maxiter))
218
219 emp4=0.0
220 emp5=0.0
221 a=1
222 iter=1
223
224 !$omp target data &
225 map(to: eorb) &
226 map(to: Jia, Kia, Tia, Xia) &
227 map(to: Tkj, Kkj) &
228 map(to: t1v1, t1v2)
229
230 do klo = 1, nocc, kchunk
231 khi = MIN(nocc, klo+kchunk-1)
232 do j = 1, nocc
233
234#if defined(DO_UPDATE_ARRAYS)
235! Update elements of Tkj and KKj.
236 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
237 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
238
239 !$omp target update to (Tkj, Kkj)
240#endif
241
242 do i = 1, nocc
243
244#if defined(DO_UPDATE_ARRAYS)
245! Update elements of Jia, Kia, Tia, Xia arrays.
246 Jia(lnvv) = Jia(lnvv) + 1.0
247 Kia(lnvv) = Kia(lnvv) + 1.0
248 Tia(lnov) = Tia(lnov) + 1.0
249 Xia(lnov) = Xia(lnov) + 1.0
250
251 !$omp target update to (Jia, Kia, Tia, Xia)
252#endif
253
254 do k = klo, MIN(khi,i)
255
256#if defined(DO_UPDATE_ARRAYS)
257! Update elements of t1v1 array.
258 t1v1(:) = Tkj(lnvv-nvir+1:lnvv)
259
260 !$omp target update to (t1v1)
261#endif
262
263 t0 = omp_get_wtime()
264
265 call omp_fbody(f1n,f2n,eorb, &
266 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
267 Jia, Kia, Tia, Xia, Tkj, Kkj, &
268 t1v1,t1v2)
269
270 t1 = omp_get_wtime()
271 timers(iter) = (t1-t0)
272 if (iter .eq. maxiter) then
273 print *, "Stopping after ", iter, "iterations"
274 print *, " "
275 goto 150
276 endif
277
278! Prevent NAN for large maxiter...
279 if (emp4 > 1000.0) then
280 emp4 = emp4 - 1000.0
281 endif
282 if (emp4 < -1000.0) then
283 emp4 = emp4 + 1000.0
284 endif
285 if (emp5 > 1000.0) then
286 emp5 = emp5 - 1000.0
287 endif
288 if (emp5 < -1000.0) then
289 emp5 = emp5 + 1000.0
290 endif
291
292 iter = iter + 1
293
294 end do ! k = klo, MIN(khi,i)
295 end do ! do i = 1, nocc
296 end do ! do j = 1, nocc
297 end do ! do klo = 1, nocc, kchunk
298
299 150 CONTINUE
300 !$omp end target data
301
302 tsum = 0.0
303 tmax = -1.0e10
304 tmin = 1.0e10
305 do i = 2, iter
306 tsum = tsum + timers(i)
307 tmax = MAX(tmax,timers(i))
308 tmin = MIN(tmin,timers(i))
309 end do
310
311 tavg = tsum / (iter - 1)
312 print *, "TOTAL ITER: ", iter
313 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
314 110 format (A, F9.6, A, F9.6, A, F9.6, A)
315
316 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
317 120 format (A, F15.3, A, F15.3)
318
319 print *, "END"
320
321 deallocate (f1n)
322 deallocate (f2n)
323 deallocate (eorb)
324 deallocate (Jia)
325 deallocate (Kia)
326 deallocate (Tia)
327 deallocate (Xia)
328 deallocate (Tkj)
329 deallocate (Kkj)
330
331 deallocate (t1v1)
332 deallocate (t1v2)
333 deallocate (timers)
334
335 end program
性能比较
我们使用以下命令来编译、链接和运行各个版本。在下面的命令中,将源文件名替换为 TEST.f。
编译:
ifx -O3 -fiopenmp -fopenmp-targets=spir64 -DMKL -qmkl -fpp -free -D DO_UPDATE_ARRAYS TEST.f -c -o TEST.o
链接:
ifx -fiopenmp -fopenmp-targets=spir64 -qmkl -fsycl -L${MKLROOT}/lib/intel64 -liomp5 -lsycl -lOpenCL -lstdc++ -lpthread -lm -ldl -lmkl_sycl -D DO_UPDATE_ARRAYS TEST.o -o TEST.exe
运行:
LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=copy OMP_TARGET_OFFLOAD=MANDATORY ZE_AFFINITY_MASK=0.0 LIBOMPTARGET_DEBUG=0 LIBOMPTARGET_PLUGIN=level0 TEST.exe
我们比较了在使用的特定 GPU (仅 1 堆栈)上运行各个版本的性能。
各个版本的平均迭代时间如下。
版本 1 (test-SharedMem.f: 0.115163 秒 版本 2 (test-Map-UpdateTo.f): 0.002012 秒 版本 3 (test-DeviceMem-Map-UpdateTo.f): 0.001978 秒
283

被折叠的 条评论
为什么被折叠?



