测试:
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <malloc.h>
typedef unsigned int u_int;

/*
 * Benchmark parameters.
 * The expansions are parenthesized so the macros stay correct inside larger
 * expressions (e.g. `x / N` or `x % N`).
 */
#define N (1024*1024*16)  /* bytes copied per call (16 MiB) */
#define M (1000)          /* number of calls timed per routine */

/*
 * Copy routines under test.
 *
 * movsb_cpy1, movsd_cpy, cpp_cpy and mov_cpy0 take (destination, source, size).
 * The sse_copy* routines read from p1 and write to p2, i.e. they take
 * (source, destination, size) - the reverse of the others and of memcpy.
 */
void movsb_cpy1(void* Dst,void* Src,size_t Maxsize) ;
void movsd_cpy(void* Dst,void* Src,size_t Maxsize) ;
void cpp_cpy(void* Dst,void* Src,size_t Maxsize) ;
void sse_copy1(void *p1, void *p2, size_t n);
void sse_copy2(void *p1, void *p2, size_t n);
void sse_copy3(void *p1, void *p2, size_t n);
void mov_cpy0(void* Dst,void* Src,size_t Maxsize);
int main()
{
u_int a[4] = {(1U<<24) + (2U<<16) + (3U<<8) + 4};
u_int *p = a;
u_int * p1 = (u_int*)_aligned_malloc(N, 16);
u_int * p2 = (u_int*)_aligned_malloc(N, 16);
int x;
clock_t k1, k2;
x=M;
k1 = clock();
while(x--)
movsb_cpy1(p1,p2,N);
k2 = clock();
printf("movsb_cpy1: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
movsd_cpy(p1,p2,N);
k2 = clock();
printf("movsd_cpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
sse_copy1(p1,p2,N);
k2 = clock();
printf("sse_copy1: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
sse_copy2(p1,p2,N);
k2 = clock();
printf("sse_copy2: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
sse_copy3(p1,p2,N);
k2 = clock();
printf("sse_copy3: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
memcpy(p1,p2,N);
k2 = clock();
printf("memcpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
cpp_cpy(p1,p2,N);
k2 = clock();
printf("cpp_cpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
x=M;
k1 = clock();
while(x--)
mov_cpy0(p1,p2,N);
k2 = clock();
printf("mov_cpy0: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);
_aligned_free(p1);
_aligned_free(p2);
return 0;
}
/*
 * Copies n bytes from p1 to p2 with aligned SSE2 loads, non-temporal stores
 * and software prefetch of the source (32-bit MSVC inline assembly).
 *
 * p1  source buffer, must be 16-byte aligned (movdqa)
 * p2  destination buffer, must be 16-byte aligned (movntdq)
 * n   number of bytes to copy
 *
 * Note the argument order is (source, destination), the reverse of memcpy.
 * The bulk is moved in 128-byte blocks; the remaining n % 128 bytes are
 * copied with rep movsb, so any n (including 0) is handled.
 */
void sse_copy3(void *p1, void *p2, size_t n)
{
    __asm
    {
        mov esi, p1
        mov edi, p2
        mov ecx, n
        shr ecx, 7                  // ecx = number of whole 128-byte blocks
        jz TAIL                     // none: skip the loop instead of wrapping ecx
    LOOP1:
        // Prefetch relative to esi (the data being read). Using the C symbol
        // here, as in [p1+128], would address the parameter's stack slot.
        prefetchnta [esi+128]
        prefetchnta [esi+160]
        prefetchnta [esi+192]
        prefetchnta [esi+224]
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]
        movdqa xmm2, [esi+32]
        movdqa xmm3, [esi+48]
        movdqa xmm4, [esi+64]
        movdqa xmm5, [esi+80]
        movdqa xmm6, [esi+96]
        movdqa xmm7, [esi+112]
        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3
        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7
        add esi, 128
        add edi, 128
        sub ecx, 1
        jnz LOOP1
        sfence                      // order the non-temporal stores before later stores
    TAIL:
        mov ecx, n
        and ecx, 127                // bytes left over after the 128-byte blocks
        rep movsb                   // no-op when ecx is 0
    }
}
/*
 * Copies n bytes from p1 to p2 with unaligned SSE loads/stores (movups)
 * and software prefetch of the source (32-bit MSVC inline assembly).
 *
 * p1  source buffer (no alignment requirement)
 * p2  destination buffer (no alignment requirement)
 * n   number of bytes to copy
 *
 * Note the argument order is (source, destination), the reverse of memcpy.
 * The bulk is moved in 128-byte blocks; the remaining n % 128 bytes are
 * copied with rep movsb, so any n (including 0) is handled.
 */
void sse_copy1(void *p1, void *p2, size_t n)
{
    __asm
    {
        mov esi, p1
        mov edi, p2
        mov ecx, n
        shr ecx, 7                  // ecx = number of whole 128-byte blocks
        jz TAIL                     // none: skip the loop instead of wrapping ecx
    LOOP1:
        // Prefetch relative to esi (the data being read). Using the C symbol
        // here, as in [p1+128], would address the parameter's stack slot.
        prefetchnta [esi+128]
        prefetchnta [esi+160]
        prefetchnta [esi+192]
        prefetchnta [esi+224]
        movups xmm0, [esi]
        movups xmm1, [esi+16]
        movups xmm2, [esi+32]
        movups xmm3, [esi+48]
        movups xmm4, [esi+64]
        movups xmm5, [esi+80]
        movups xmm6, [esi+96]
        movups xmm7, [esi+112]
        movups [edi], xmm0
        movups [edi+16], xmm1
        movups [edi+32], xmm2
        movups [edi+48], xmm3
        movups [edi+64], xmm4
        movups [edi+80], xmm5
        movups [edi+96], xmm6
        movups [edi+112], xmm7
        add esi, 128
        add edi, 128
        sub ecx, 1
        jnz LOOP1
    TAIL:
        mov ecx, n
        and ecx, 127                // bytes left over after the 128-byte blocks
        rep movsb                   // no-op when ecx is 0
    }
}
/*
 * Copies n bytes from p1 to p2 with aligned SSE2 loads (movdqa) and
 * non-temporal stores (movntdq), without prefetching
 * (32-bit MSVC inline assembly).
 *
 * p1  source buffer, must be 16-byte aligned (movdqa)
 * p2  destination buffer, must be 16-byte aligned (movntdq)
 * n   number of bytes to copy
 *
 * Note the argument order is (source, destination), the reverse of memcpy.
 * The bulk is moved in 128-byte blocks; the remaining n % 128 bytes are
 * copied with rep movsb, so any n (including 0) is handled.
 */
void sse_copy2(void *p1, void *p2, size_t n)
{
    __asm
    {
        mov esi, p1
        mov edi, p2
        mov ecx, n
        shr ecx, 7                  // ecx = number of whole 128-byte blocks
        jz TAIL                     // none: skip the loop instead of wrapping ecx
    LOOP1:
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]
        movdqa xmm2, [esi+32]
        movdqa xmm3, [esi+48]
        movdqa xmm4, [esi+64]
        movdqa xmm5, [esi+80]
        movdqa xmm6, [esi+96]
        movdqa xmm7, [esi+112]
        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3
        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7
        add esi, 128
        add edi, 128
        sub ecx, 1
        jnz LOOP1
        sfence                      // order the non-temporal stores before later stores
    TAIL:
        mov ecx, n
        and ecx, 127                // bytes left over after the 128-byte blocks
        rep movsb                   // no-op when ecx is 0
    }
}
void movsb_cpy1(void* Dst,void* Src,size_t Maxsize)
{
__asm
{
mov esi,[Src]
mov edi,[Dst]
mov ecx, [Maxsize]
rep movsb
}
}
void mov_cpy0(void* Dst,void* Src,size_t Maxsize)
{
__asm
{
mov esi,Src
mov edi,Dst
mov ecx, Maxsize
L:
mov al, byte ptr[esi]
mov byte ptr[edi], al
sub ecx, 1
jnz L
}
}
void movsd_cpy(void* Dst,void* Src,size_t Maxsize)
{
__asm
{
mov esi,[Src]
mov edi,[Dst]
mov ecx, [Maxsize]
shr ecx,2
rep movsd
}
}
//
/*
 * Copies Maxsize bytes from Src to Dst one byte at a time in plain C.
 *
 * Dst      destination buffer
 * Src      source buffer (only read)
 * Maxsize  number of bytes to copy; 0 copies nothing
 */
void cpp_cpy(void* Dst,void* Src,size_t Maxsize)
{
    char *out = (char *)Dst;
    const char *in = (const char *)Src;

    while (Maxsize--) {
        /* Advance both cursors; without the increments only byte 0 is written. */
        *out++ = *in++;
    }
}
movsb_cpy1: 14.48500 // 使用了rep的两个函数性能都不错
movsd_cpy: 14.797000
sse_copy1: 16.390000 // 在这里,prefetch好像没有发挥作用
sse_copy2: 10.313000 // movdqa 与 movntdq 比 movups 快得多
sse_copy3: 10.343000
memcpy: 14.469000 // 标准函数其实也是优化过的
cpp_cpy: 108.656000 // c++逐个字节拷贝
mov_cpy0: 109.563000 // 汇编逐个字节拷贝,如果逐个int拷贝,就变成了26s,差不多4倍
可见,
【1】rep很不错,可以学会使用
【2】sse 的对齐指令(movdqa / movntdq)要求内存按 16 字节对齐,很重要;movups 则不要求对齐