在openmp中有个比较典型的测试例子cpp_compiler_options_openmp.cpp,展示了
for循环中的归约操作, #pragma omp parallel for reduction(+:sum) private(x)
自己也写了个多线程的版本,针对Intel Core 2 Duo CPU。
计算pi的方法很多,这个方法用于测试最好了,原理:http://wenku.baidu.com/view/3287baacdd3383c4bb4cd2ed.html
c代码:
/*
 * Approximate pi with the midpoint rule: integrate 4/(1+x^2) over [0,1]
 * using num_steps equal sub-intervals, sampling each one at its centre.
 *
 * num_steps: number of sub-intervals; larger values give a closer result.
 * Returns:   the approximation of pi, or 0.0 when num_steps <= 0 (an empty
 *            sum), instead of dividing by zero and returning NaN.
 */
double test1(int num_steps) {
    int i;
    double x, pi, sum = 0.0, step;

    if (num_steps <= 0) {
        return 0.0;
    }

    step = 1.0 / (double) num_steps;
    for (i = 1; i <= num_steps; i++) {
        x = (i - 0.5) * step;            /* centre of the i-th sub-interval */
        sum = sum + 4.0 / (1.0 + x*x);
    }
    pi = step * sum;
    return pi;
}
使用MS vc++6.0 编译,得到FPU单线程版本。
用Intel C++编译,得到SSE单线程版本。
用Intel C++结合openmp生成SSE的3线程版本(两个计算线程,一个主线程)。
我将Intel C++编译得到的SSE单线程版本,手工改写为与上面相同的3线程版本(两个计算线程,一个主线程)。
处理器:Intel Core(TM)2 Duo CPU E8500 @3.16GHz 3.16GHz win7 32位.
性能如下:
MS VC For 1000000000 steps, pi = 3.141592653589971, 6506 milliseconds 单线程
Intel C++ For 1000000000 steps, pi = 3.141592653589763, 3307 milliseconds 单线程
Openmp+Intel C++ For 1000000000 steps, pi = 3.141592653589738, 1684 milliseconds 3线程
my mtTest.exe For 1000000000 steps, pi = 3.141592653589738 ,1606 milliseconds 3线程
很容易扩展成更多的线程。
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
;*--==--* fasm multiple threads.
;*--==--* By G-Spider
;*--==--* fasm mtTest.asm mtTest.exe
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
format PE console
entry start
include 'win32a.inc'
THREAD_PRIORITY_TIME_CRITICAL = 0fh
THREAD_PRIORITY_HIGHEST = 02h
CREATE_SUSPENDED = 04h
INFINITE = -1
;for (i = 1; i <= num_steps; i++) is split between the two worker threads as
;[1 , N/2-1000] [N/2-1000+1, N]   (the ranges stored in dwParam)
N = 1000000000
;---------------------------------------------
section '.text' code readable executable
; Number of worker threads started by the entry point. hTrd, dwParam and
; the result summation below are laid out for exactly two workers, so
; raising this value also requires matching changes to that data.
WORKER_COUNT = 2

;-----------------------------------------------------------------------
; start - program entry point (32-bit Windows, stdcall API calls)
; Pins the main thread to CPU 0, starts WORKER_COUNT workers (one CPU bit
; each), waits for all of them, adds the two partial results and prints
; the sum together with the elapsed GetTickCount milliseconds.
; The timed interval starts before the workers are created.
; Register roles: ebx = start tick, esi = worker index,
;                 edi = handle of the thread being configured.
; Exit status: 0 on success, 1 if a worker could not be created.
;-----------------------------------------------------------------------
start:
        invoke  GetCurrentThread
        mov     edi,eax
        invoke  SetThreadPriority,edi,THREAD_PRIORITY_TIME_CRITICAL
        invoke  SetThreadAffinityMask,edi,1
;==============================================================
        invoke  GetTickCount
        mov     ebx,eax                 ; ebx = tick count at start
        xor     esi,esi                 ; esi = index of next worker
.spawn:
        xor     eax,eax
        lea     edx,[esi*8+dwParam]     ; edx -> {lower, higher} pair of this worker
        invoke  CreateThread,eax,eax,ThreadProc,edx,CREATE_SUSPENDED,eax
        test    eax,eax
        jz      _END                    ; creation failed: exit with status 1
        mov     [hTrd+esi*4],eax
        mov     edi,eax
        invoke  SetThreadPriority,edi,THREAD_PRIORITY_HIGHEST
        mov     ecx,esi
        mov     eax,1
        shl     eax,cl                  ; affinity mask = 1 shl index (one CPU per worker)
        invoke  SetThreadAffinityMask,edi,eax
        invoke  ResumeThread,edi        ; created suspended so priority/affinity apply first
        inc     esi
        cmp     esi,WORKER_COUNT
        jb      .spawn

        invoke  WaitForMultipleObjects,WORKER_COUNT,hTrd,TRUE,INFINITE
        invoke  GetTickCount
        sub     eax,ebx                 ; eax = elapsed milliseconds
;==============================================================
; printf(szFmt, (double)pi, (int)elapsed) - cdecl, arguments pushed by hand
; because the double occupies two stack slots.
        push    eax                     ; elapsed ms
        sub     esp,8
        fld     qword [dwParam+16]      ; partial result of worker 0
        fadd    qword [dwParam+24]      ; + partial result of worker 1
        fstp    qword [esp]             ; pi
        push    szFmt
        call    [printf]
        add     esp,16                  ; 4 (fmt) + 8 (double) + 4 (int)

; Release the worker thread handles now that the results have been read.
        xor     esi,esi
.close:
        invoke  CloseHandle,[hTrd+esi*4]
        inc     esi
        cmp     esi,WORKER_COUNT
        jb      .close

        cinvoke system,szPause
        invoke  ExitProcess,0
_END:
; Reached only when CreateThread failed; ExitProcess also ends any worker
; that was already started.
        invoke  ExitProcess,1
;align 16
;-----------------------------------------------------------------------
; ThreadProc - worker thread entry point (stdcall, one argument)
; In:    lpParam -> two dwords {lower, higher}: the inclusive index range
;        this worker sums.
; Out:   the double returned by _testPiSSE in st0 is stored at
;        [lpParam+16]; the caller must provide those 8 bytes.
;        eax = 0, so the thread exit code is defined rather than being
;        whatever _testPiSSE left in eax.
; Saves: esi (through the proc macro).
;-----------------------------------------------------------------------
proc ThreadProc uses esi, lpParam
        mov     esi,[lpParam]
        stdcall _testPiSSE,[esi],[esi+4]
        fstp    qword [esi+16]          ; pop st0 into this worker's result slot
        xor     eax,eax                 ; thread exit code = 0
        ret
endp
align 16
;-----------------------------------------------------------------------
; _testPiSSE(lower, higher)
; Sums 4.0/(1.0 + x*x) with x = (i - 0.5)*step for every integer i in the
; inclusive range [lower, higher] and returns step*sum. step is not an
; argument: it is the constant held in _2il0floatpacket.12 (scalar) and
; _2il0floatpacket.13 (packed).
; Convention: two dword stack arguments, removed by the callee (retn 8);
;             the double result is returned in st0.
; Registers:  ebx and ebp are saved and restored; eax, ecx, edx and
;             xmm0-xmm7 are overwritten.
; Stack:      esp is realigned to 16 bytes so [esp] can be used with movaps.
; The .B2.x labels and the Preds/Prob notes are the basic-block annotations
; of the original compiler listing.
;-----------------------------------------------------------------------
_testPiSSE:
; parameter 1: 8 + ebp ;lower
; parameter 2: 12 + ebp ;higher
; --- prologue; an empty range (lower > higher, signed) returns step*0 ---
.B2.1: ; Preds .B2.0
push ebp
mov ebp, esp
and esp, -16 ; align the frame to 16 bytes
push ebx
sub esp, 28 ; with the 4-byte push: 32 bytes, so [esp] stays 16-byte aligned
mov eax, dword [8+ebp] ; eax = lower
mov ecx, dword [12+ebp] ; ecx = higher
cmp eax, ecx
jg .B2.10 ; Prob 50%
; --- ecx = number of indices; fewer than 8 go straight to the scalar loop ---
.B2.2: ; Preds .B2.1
sub ecx, eax
inc ecx ; ecx = higher - lower + 1
cmp ecx, 8
jl .B2.12 ; Prob 10%
; --- vector setup ---
; edx = count rounded down to a multiple of 8 (work for the vector loop)
; xmm3 = dwords {lower, lower+1, 0, 0}: the current pair of indices
; xmm4 = dwords {2, 2, 2, 2}: index increment per pair
; xmm5 = packed step, xmm2 = packed 0.5
; xmm6 and xmm0 = two independent packed accumulators, both zero
; ebx = number of indices processed so far
.B2.3: ; Preds .B2.2
mov ebx, 2
mov edx, ecx
and edx, 7
neg edx
pxor xmm1, xmm1
pxor xmm6, xmm6
add edx, ecx ; edx = ecx - (ecx and 7)
movd xmm3, ebx
lea ebx, dword [1+eax] ; ebx = lower + 1
pshufd xmm4, xmm3, 0 ; broadcast the dword 2 to all four lanes
movd xmm3, eax
movaps xmm5, dqword [_2il0floatpacket.13]
movaps xmm2, dqword [_2il0floatpacket.14]
movd xmm0, ebx
xor ebx, ebx
punpckldq xmm3, xmm0 ; xmm3 low qword = {lower, lower+1}
movaps xmm0, xmm6
punpcklqdq xmm3, xmm1 ; clear the high qword of xmm3
; --- vector loop: 4 pairs (8 indices) per pass ---
; For each pair: t = ((double)i - 0.5)*step; term = 4.0/(t*t + 1.0).
; Pairs 1 and 3 are added to xmm6; pairs 2 and 4 to the second
; accumulator, which lives in xmm0 at the loop boundary and is spilled to
; [esp] inside the body. The flags set by "cmp ebx, edx" are consumed by
; the jb at the bottom; the SSE instructions in between leave EFLAGS alone.
.B2.4: ; Preds .B2.4 .B2.3
cvtdq2pd xmm7, xmm3 ; pair 1: convert the two indices to double
subpd xmm7, xmm2
mulpd xmm7, xmm5
mulpd xmm7, xmm7
movaps xmm1, dqword [_2il0floatpacket.16] ; xmm1 = packed 1.0
paddd xmm3, xmm4 ; advance both indices by 2
addpd xmm7, xmm1
movaps dqword [esp], xmm0 ; spill the second accumulator
add ebx, 8
movaps xmm0, dqword [_2il0floatpacket.15] ; xmm0 = packed 4.0
cmp ebx, edx ; loop condition, tested by jb below
divpd xmm0, xmm7
cvtdq2pd xmm7, xmm3 ; pair 2
addpd xmm6, xmm0 ; first accumulator += pair 1 terms
subpd xmm7, xmm2
mulpd xmm7, xmm5
mulpd xmm7, xmm7
addpd xmm7, xmm1
movaps xmm0, dqword [_2il0floatpacket.15]
paddd xmm3, xmm4
divpd xmm0, xmm7
movaps xmm7, dqword [esp]
addpd xmm7, xmm0 ; second accumulator += pair 2 terms
cvtdq2pd xmm0, xmm3 ; pair 3
subpd xmm0, xmm2
mulpd xmm0, xmm5
mulpd xmm0, xmm0
addpd xmm0, xmm1
movaps dqword [esp], xmm7 ; spill the second accumulator again
paddd xmm3, xmm4
movaps xmm7, dqword [_2il0floatpacket.15]
divpd xmm7, xmm0
cvtdq2pd xmm0, xmm3 ; pair 4
addpd xmm6, xmm7 ; first accumulator += pair 3 terms
subpd xmm0, xmm2
mulpd xmm0, xmm5
mulpd xmm0, xmm0
addpd xmm0, xmm1
movaps xmm1, dqword [_2il0floatpacket.15] ; xmm1 now holds packed 4.0
paddd xmm3, xmm4
divpd xmm1, xmm0
movaps xmm0, dqword [esp]
addpd xmm0, xmm1 ; second accumulator += pair 4 terms
jb .B2.4 ; Prob 82%
; --- merge both accumulators and reduce to one scalar in the low half of xmm6 ---
.B2.5: ; Preds .B2.4
addpd xmm6, xmm0
movaps xmm0, xmm6
unpckhpd xmm0, xmm6 ; low half of xmm0 = high half of the sum
addsd xmm6, xmm0
; --- tail: eax = first index not yet processed, edx = indices already done ---
.B2.6: ; Preds .B2.5 .B2.12
movsd xmm2, qword [_2il0floatpacket.12] ; xmm2 = scalar step
add eax, edx
cmp edx, ecx
jae .B2.11 ; Prob 10%
.B2.7: ; Preds .B2.6
movsd xmm1, qword [_2il0floatpacket.17] ; xmm1 = 0.5
movsd xmm0, qword [_2il0floatpacket.19] ; xmm0 = 1.0
; --- scalar loop: one index per pass until edx reaches the total count ---
.B2.8: ; Preds .B2.8 .B2.7
pxor xmm3, xmm3
inc edx
cvtsi2sd xmm3, eax ; xmm3 = (double) i
movsd xmm4, qword [_2il0floatpacket.18] ; xmm4 = 4.0
inc eax
cmp edx, ecx ; loop condition, tested by jb below
subsd xmm3, xmm1
mulsd xmm3, xmm2
mulsd xmm3, xmm3
addsd xmm3, xmm0
divsd xmm4, xmm3
addsd xmm6, xmm4 ; sum += 4.0/(1.0 + x*x)
jb .B2.8 ; Prob 82%
jmp .B2.11 ; Prob 100%
; --- empty range: sum = 0 ---
.B2.10: ; Preds .B2.1
movsd xmm2, qword [_2il0floatpacket.12]
pxor xmm6, xmm6
; --- result = step * sum, moved through memory onto the x87 stack ---
.B2.11: ; Preds .B2.8 .B2.6 .B2.10
mulsd xmm2, xmm6
movsd qword [esp], xmm2
fld qword [esp] ; return value in st0
add esp, 28
pop ebx
mov esp, ebp
pop ebp
retn 8 ; callee removes the two dword arguments
; --- fewer than 8 indices: nothing done by the vector loop, sum starts at 0 ---
.B2.12: ; Preds .B2.2
xor edx, edx
pxor xmm6, xmm6
jmp .B2.6 ; Prob 100%
;---------------------------------------------
section '.data' data readable writeable
; Packed double constants read with movaps in _testPiSSE. movaps faults on
; an operand that is not 16-byte aligned, so the alignment is requested
; explicitly instead of relying on these being first in the section.
align 16
_2il0floatpacket.13 DD 0e826d695H,03e112e0bH,0e826d695H,03e112e0bH ; {step, step}, step = 1.0e-9
_2il0floatpacket.14 DD 000000000H,03fe00000H,000000000H,03fe00000H ; {0.5, 0.5}
_2il0floatpacket.15 DD 000000000H,040100000H,000000000H,040100000H ; {4.0, 4.0}
_2il0floatpacket.16 DD 000000000H,03ff00000H,000000000H,03ff00000H ; {1.0, 1.0}
; Scalar double constants. step is hard-coded as 1/N for N = 1000000000;
; the step values here and above must be updated by hand if N changes.
_2il0floatpacket.12 DD 0e826d695H,03e112e0bH ; step = 1.0e-9 (N=1000000000)
_2il0floatpacket.17 DD 000000000H,03fe00000H ; 0.5
_2il0floatpacket.18 DD 000000000H,040100000H ; 4.0
_2il0floatpacket.19 DD 000000000H,03ff00000H ; 1.0
; Worker parameters and results:
;   +0  {lower, higher} for worker 0      +8  {lower, higher} for worker 1
;   +16 double result of worker 0         +24 double result of worker 1
; ThreadProc stores its result 16 bytes past the pair it was given.
dwParam dd 1,N/2-1000,N/2+1-1000,N,0,0,0,0
hTrd rd 2 ; handles of the two worker threads
szFmt db '%.15lf ,%d ms', 0aH, 00H
szPause db 'pause',0
;---------------------------------------------
section '.idata' import data readable writeable
library kernel32,'KERNEL32.DLL',\
msvcrt,'msvcrt.dll'
include 'api\kernel32.inc'
import msvcrt,\
printf,'printf',\
system,'system'