对 strlen 汇编代码的解释

最新推荐文章于 2022-06-04 13:35:57 发布

weixin_34220963

最新推荐文章于 2022-06-04 13:35:57 发布

阅读量323

点赞数

本文对比了自定义strlen函数与标准库版本的性能差异，并深入分析了标准库中strlen函数的高效实现原理，揭示了其利用魔数进行快速零字节检测的技巧。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

　　　　【原创性声明】在调试到strlen的汇编代码时，我发现其汇编代码略显“古怪”，当时并未有精力去关注，在研究过后重新查看了strlen的汇编代码。对strlen的原理解释并非原创，而是参考了《strlen源码剖析》一文的解释，在理解后感觉有必要写此文记之。

　　　　首先我们创建一个测试项目，来比较以下我们自己用常规方法写的的strlen函数和<string.h> 中的strlen的时间消耗水平，这里我用 GetTickCount（）做一个初步的估算（并不精确），测试代码如下：

strlen时间消耗测试

#include " stdafx.h "
#include < stdlib.h >
#include < stdio.h >
#include < string .h >
#include < windows.h >

// 这是我们自己按常规算法实现的strlen，即逐个byte判断是否为0
size_t mystrlen( char * s)
{
     char * p = s;
     while ( * p) ++ p;
     return (p - s);
}

int main( int argc, char * argv[])
{
     char * s;
    size_t result, i;
    DWORD tickcount;

    s = ( char * )malloc( 4096 );
    memset(s, ' a ' , 4096 );
    s[ 4091 ] = 0 ;

     // 粗略测试两种strlen的时间
     // 标准库中的strlen：
    tickcount = GetTickCount();
     for (i = 0 ; i < 20000 ; i ++ )
    {
        result = strlen(s);
    }
    printf( " strlen  :  %ld ms, result = %ld\n " , GetTickCount() - tickcount, result);

     // 我们自己编写的strlen：
    tickcount = GetTickCount();
     for (i = 0 ; i < 20000 ; i ++ )
    {
        result = mystrlen(s);
    }
    printf( " mystrlen:  %ld ms, result = %ld\n " , GetTickCount() - tickcount, result);

    free(s);
     return 0 ;
}

　　　　该测试的输出结果如下：

　　　　strlen : 16 ms, result = 4091
　　　　mystrlen: 218 ms, result = 4091

　　　　首先毫无疑问两种方法的时间复杂度相同（O（n）），区别在于常数系数不同，在这里我们自己写的函数的时间消耗大约是标准库版本的 13 倍之多。下面我们可以分别看下两个版本的汇编代码。

　　　　首先是我们自己编写的mystrlen版本，编译器产生的汇编代码如下：

代码

. text: 00401020 mystrlen        proc near
. text: 00401020
. text: 00401020 var_44          = dword ptr -44h
. text: 00401020 var_4           = dword ptr - 4
. text: 00401020 arg_0           = dword ptr   8
. text: 00401020
. text: 00401020                   push     ebp
. text: 00401021                   mov      ebp, esp
. text: 00401023                   sub      esp, 44h
. text: 00401026                   push     ebx
. text: 00401027                   push     esi
. text: 00401028                   push     edi
. text: 00401029                   lea      edi, [ebp+var_44]
. text: 0040102C                   mov      ecx, 11h
. text: 00401031                   mov      eax, 0CCCCCCCCh
. text: 00401036                   rep stosd
. text: 00401038                   mov      eax, [ebp+arg_0]   ;
. text: 0040103B                   mov      [ebp+var_4], eax   ; char* p = s;
. text: 0040103E
. text: 0040103E loc_40103E:                               ;
. text: 0040103E                   mov      ecx, [ebp+var_4]
. text: 00401041                   movsx    edx, byte ptr [ecx] ; edx = *p;
. text: 00401044                   test     edx, edx             ; if ( edx == 0 )
. text: 00401046                   jz       short loc_401053     ;      break;
. text: 00401048                   mov      eax, [ebp+var_4]
. text: 0040104B                   add      eax, 1
. text: 0040104E                   mov      [ebp+var_4], eax     ; ++p;
. text: 00401051                   jmp      short loc_40103E     ;
. text: 00401053 ;
. text: 00401053
. text: 00401053 loc_401053:                               ; CODE XREF: mystrlen+26j
. text: 00401053                   mov      eax, [ebp+var_4]      ; eax = p;
. text: 00401056                   sub      eax, [ebp+arg_0]      ; return (p - s);
. text: 00401059                   pop      edi
. text: 0040105A                   pop      esi
. text: 0040105B                   pop      ebx
. text: 0040105C                   mov      esp, ebp
. text: 0040105E                   pop      ebp
. text: 0040105F                   retn
. text: 0040105F mystrlen        endp

　　　　很明显，我们自己编写的函数 mystrlen 是逐个 byte 去比较的。

　　　　下面我们看标准库中的strlen的汇编代码，位于VC6的如下路径：...\Microsoft Visual Studio\VC98\CRT\SRC\Intel\strlen.ASM

代码

        CODESEG

        public  strlen

strlen  proc

        .FPO    ( 0 , 1 , 0 , 0 , 0 , 0 )

string  equ     [esp + 4 ]

         mov      ecx,string               ; ecx -> string
         test     ecx, 3                     ; test if string is aligned on 32 bits
         je       short main_loop

str_misaligned:
         ; simple byte loop until string is aligned
         mov      al,byte ptr [ecx]         ; 在DWORD未对齐部分逐个字节扫描
         inc      ecx
         test     al,al
         je       short byte_3
         test     ecx, 3
         jne      short str_misaligned

         add      eax,dword ptr 0           ; 5 byte nop to align label below

        align    16                        ; should be redundant

main_loop:
         mov      eax,dword ptr [ecx]      ; read 4 bytes, 复制四个char到eax
         mov      edx,7efefeffh            ; 魔数 0x 7efefeff
         add      edx,eax                  ; edx = 0x7efefeff + eax;
         xor      eax,- 1                    ; eax = ~eax （按位取反）
         xor      eax,edx                  ; eax = ~eax ^ edx
         add      ecx, 4                     ; ecx 指向到下一个 DWORD
         test     eax,81010100h            ; eax & (~0x 7efefeff), 含 0字节吗？
         je       short main_loop
         ; found zero byte in the loop     如果当前的DWORD含0，找出其位于哪一个字节
         mov      eax,[ecx - 4 ]
         test     al,al                    ; is it byte 0
         je       short byte_0
         test     ah,ah                    ; is it byte 1
         je       short byte_1
         test     eax,00ff0000h            ; is it byte 2
         je       short byte_2
         test     eax,0ff000000h           ; is it byte 3
         je       short byte_3
         jmp      short main_loop          ; taken if bits 24-30 are clear and bit
                                         ; 31 is set

         ; 0 字节的地址减去字符串地址，即为strlen的结果。
byte_3:
         lea      eax,[ecx - 1 ]
         mov      ecx,string
         sub      eax,ecx
         ret
byte_2:
         lea      eax,[ecx - 2 ]
         mov      ecx,string
         sub      eax,ecx
         ret
byte_1:
         lea      eax,[ecx - 3 ]
         mov      ecx,string
         sub      eax,ecx
         ret
byte_0:
         lea      eax,[ecx - 4 ]
         mov      ecx,string
         sub      eax,ecx
         ret

strlen  endp

        end

　　　　请注意，strlen的主循环部分，是已4个char为一组作为一个DORD送到eax去检测的，因此要求必须读取DWORD时以 4 bytes 对齐，所以如果字符串地址并不对齐，则前面的1~3个char必须逐个字节检测。进入主循环的循环条件相当于以下代码：

　　　　if ((((longword + magic_bits) ^ ~longword) & ~magic_bits) != 0 )

{

　　　　　　　　//则在这4个char中至少有一个char的值是0！

　　　　}

　　　　理解strlen代码的关键在于理解上面这个检测DWORD中含有 0 字节的条件。可以详细参考参考资料中的解释，这里我再少许重复以下：

　　　　我们把 4 个 char 取到 eax 中，然后为了检测 eax 中是否含有 0 byte，注意不能逐个判断（否则效率就和普通算法无异），而是对eax加上一个魔数，该数字的取值是：7efefeff, 写成2进制是：

<-----------------------| ... ... string

| 7E | FE | FE | FF |

| 0111 1110 | 1111 1110 | 1111 1110 | 1111 1111 |

　　　　请注意上面这个数字的4个 0 的位置（holes），它们的位置是经过选取的，主要目的为了检测是否有进位。在eax和它相加后，如果eax的某个字节不为0，一定会在该字节上产生向“左侧”hole的进位，反之如果某个字节为0，则一定不会产生进位。这样相加后我们只要检测 4 个hole是否全部发生改变，如果有任何一个hole依旧为0不变，就知道当前的四个byte中必含有0，然后我们从byte[0]到byte[3]逐个探查哪一个byte为0即可。

　　　　注意，当发现含有 0 的时候，是从低位byte到高位byte的顺序去检测的，这个顺序也很重要，因为低位的byte更靠近字符串的开始位置。

　　　　上面的判断条件，是根据异或的性质，通过异或操作去找出 holes 是否发生改变，如果有任何hole在相加后未发生变化，会在计算结果中的相应位产生1，即不为0。如果所有hole都发生变化，这计算结果为0，即4个byte都不为0。

　　　　代码中的" XOR EAX -1 "；由于 -1 的补码是 0x FFFFFFFF，和1异或的结果相当于取反( 0^1 = 1, 1^1 = 0)，即和（-1）异或相当于对该操作数取反。

　　　　参考资料：

　　　　strlen源码剖析， ant