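; SAD_W16 <cpu>: the single macro argument names the instruction set (e.g. sse2)
; and is pasted into the generated function name as its suffix.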
%macro SAD_W16 1
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; 16x16 sum of absolute differences, fully unrolled (no loop, no branches).
; The "_sse2" suffix above is illustrative: the real suffix is macro argument %1.
;
; Registers (r0..r3 are presumably the four C arguments in order, as set up by
; cglobal, which is defined outside this chunk -- confirm against x86inc):
;   r0 = block A pointer, r1 = block A row stride
;   r2 = block B pointer, r3 = block B row stride
;   m0 = running accumulator; m1..m7 = per-row temporaries, recycled as soon
;        as their partial sum has been folded into m0.
;
; Memory access:
;   Block B rows are fetched with movdqu, so they may be unaligned.
;   Block A rows are consumed directly as psadbw memory operands; in the
;   legacy SSE encoding that form requires a 16-byte-aligned address, so
;   r0 and r1 are assumed to keep every row aligned -- TODO confirm at callers.
;
; Overflow: psadbw leaves one sum (<= 8*255) in the low word of each 64-bit
; half; 16 rows give at most 16*2040 = 32640 per half, so paddw cannot wrap.
;
; Loads of upcoming B rows are interleaved with the psadbw/paddw work on
; earlier rows; keep the instruction order as-is.
cglobal x264_pixel_sad_16x16_%1, 4,4,8  ; 4 args, 4 GPRs, 8 XMM registers
    movdqu  m0, [r2]                    ; m0 = B row 0
    movdqu  m1, [r2+r3]                 ; m1 = B row 1
    lea     r2, [r2+2*r3]               ; r2 -> B row 2
    movdqu  m2, [r2]                    ; m2 = B row 2
    movdqu  m3, [r2+r3]                 ; m3 = B row 3
    lea     r2, [r2+2*r3]               ; r2 -> B row 4
    psadbw  m0, [r0]                    ; m0 = SAD(row 0)
    psadbw  m1, [r0+r1]                 ; m1 = SAD(row 1)
    lea     r0, [r0+2*r1]               ; r0 -> A row 2
    movdqu  m4, [r2]                    ; m4 = B row 4
    paddw   m0, m1                      ; m0 = rows 0-1
    psadbw  m2, [r0]                    ; m2 = SAD(row 2)
    psadbw  m3, [r0+r1]                 ; m3 = SAD(row 3)
    lea     r0, [r0+2*r1]               ; r0 -> A row 4
    movdqu  m5, [r2+r3]                 ; m5 = B row 5
    lea     r2, [r2+2*r3]               ; r2 -> B row 6
    paddw   m2, m3                      ; m2 = rows 2-3
    movdqu  m6, [r2]                    ; m6 = B row 6
    movdqu  m7, [r2+r3]                 ; m7 = B row 7
    lea     r2, [r2+2*r3]               ; r2 -> B row 8
    paddw   m0, m2                      ; m0 = rows 0-3; m2 is free for reuse
    psadbw  m4, [r0]                    ; m4 = SAD(row 4)
    psadbw  m5, [r0+r1]                 ; m5 = SAD(row 5)
    lea     r0, [r0+2*r1]               ; r0 -> A row 6
    movdqu  m1, [r2]                    ; m1 = B row 8
    paddw   m4, m5                      ; m4 = rows 4-5
    psadbw  m6, [r0]                    ; m6 = SAD(row 6)
    psadbw  m7, [r0+r1]                 ; m7 = SAD(row 7)
    lea     r0, [r0+2*r1]               ; r0 -> A row 8
    movdqu  m2, [r2+r3]                 ; m2 = B row 9
    lea     r2, [r2+2*r3]               ; r2 -> B row 10
    paddw   m6, m7                      ; m6 = rows 6-7
    movdqu  m3, [r2]                    ; m3 = B row 10
    paddw   m0, m4                      ; m0 = rows 0-5 (everything folds into m0)
    movdqu  m4, [r2+r3]                 ; m4 = B row 11
    lea     r2, [r2+2*r3]               ; r2 -> B row 12
    paddw   m0, m6                      ; m0 = rows 0-7
    psadbw  m1, [r0]                    ; m1 = SAD(row 8)
    psadbw  m2, [r0+r1]                 ; m2 = SAD(row 9)
    lea     r0, [r0+2*r1]               ; r0 -> A row 10
    movdqu  m5, [r2]                    ; m5 = B row 12
    paddw   m1, m2                      ; m1 = rows 8-9
    psadbw  m3, [r0]                    ; m3 = SAD(row 10)
    psadbw  m4, [r0+r1]                 ; m4 = SAD(row 11)
    lea     r0, [r0+2*r1]               ; r0 -> A row 12
    movdqu  m6, [r2+r3]                 ; m6 = B row 13
    lea     r2, [r2+2*r3]               ; r2 -> B row 14
    paddw   m3, m4                      ; m3 = rows 10-11
    movdqu  m7, [r2]                    ; m7 = B row 14
    paddw   m0, m1                      ; m0 = rows 0-9
    movdqu  m1, [r2+r3]                 ; m1 = B row 15 (last load)
    paddw   m0, m3                      ; m0 = rows 0-11
    psadbw  m5, [r0]                    ; m5 = SAD(row 12)
    psadbw  m6, [r0+r1]                 ; m6 = SAD(row 13)
    lea     r0, [r0+2*r1]               ; r0 -> A row 14
    paddw   m5, m6                      ; m5 = rows 12-13
    psadbw  m7, [r0]                    ; m7 = SAD(row 14)
    psadbw  m1, [r0+r1]                 ; m1 = SAD(row 15)
    paddw   m7, m1                      ; m7 = rows 14-15
    paddw   m0, m5                      ; m0 = rows 0-13
    paddw   m0, m7                      ; m0 = rows 0-15 (one partial sum per 64-bit half)
    SAD_END_SSE2                        ; defined elsewhere; presumably merges the two halves of m0 and returns
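; NOTE(review): the following stray text is not assembly (origin unknown, looks
; like leftover page residue); commented out so it cannot break the build.
; 03-09
; 4585
;
; 11-11
; 1004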
