字符串转换为整数

最新推荐文章于 2019-10-25 19:45:40 发布

Joseph-Growth

最新推荐文章于 2019-10-25 19:45:40 发布

阅读量1.4k

点赞数

本文深入探讨了字符串转整数的实现方式，对比了Microsoft的atoi函数与Linux内核中的simple_strtol等函数的不同之处，并详细解析了后者如何优雅地处理溢出问题。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

最近突然来了兴致，开始玩CSDN的pango系统里类似Online Judge的测评系统。刷了3道水题，错了一道，时至今日才知道错误的原因。下面参考July大神的这篇文章（http://blog.csdn.net/v_july_v/article/details/9024123#comments）把这道题分析一下。

题目：

当给的字符串是如左边图片所示的时候，有考虑到么？当然，它们各自对应的正确输出如右边图片所示（假定你是在32位系统下，且编译环境是VS2008以上）：

刚拿到题时，看都没看，心想不就是找整数嘛，一个栈搞定，结果没有仔细审题，没有注意到数据范围的问题。（这道题最大的陷阱就在于有数据范围限制）

题目中说的是字符串转int，大家都知道int类型是32位的有符号类型，取值范围是 -2^31 ~ (2^31 - 1)。

题上很清晰的提到，不能使用Microsoft的atoi函数，那我们不妨来看看微软的atoi函数是如何来实现的。

 
   //atol函数  
 //Copyright (c) 1989-1997, Microsoft Corporation. All rights reserved.  
 /*
  * atol - convert the leading decimal digits of a string to a long.
  *
  * nptr: NUL-terminated string. Leading whitespace (as reported by
  *       isspace) is skipped, then one optional '-' or '+' is accepted.
  *
  * Returns the accumulated value, negated when the sign character was '-'.
  * Parsing stops at the first non-digit, so a string without digits
  * yields 0. Overflow is not detected: the value is accumulated in a
  * plain long.
  */
 long __cdecl atol(
     const char *nptr
     )
 {
     int c; /* current char */
     long total; /* current total */
     int sign; /* if '-', then negative, otherwise positive */

     /* skip whitespace */
     while ( isspace((int)(unsigned char)*nptr) )
         ++nptr;

     c = (int)(unsigned char)*nptr++;
     sign = c; /* save sign indication */
     if (c == '-' || c == '+')
         c = (int)(unsigned char)*nptr++; /* skip sign */

     total = 0;

     while (isdigit(c)) {
         total = 10 * total + (c - '0'); /* accumulate digit */
         c = (int)(unsigned char)*nptr++; /* get next char */
     }

     /* return result, negated if necessary */
     if (sign == '-')
         return -total;
     else
         return total;
 }
 
 

其中，isspace和isdigit函数的实现代码为：

 
   // isspace - report whether x is a whitespace character.
 // Returns 1 for space, '\t', '\n', '\v', '\f' and '\r' (the set the C
 // standard library's isspace accepts in the "C" locale), otherwise 0.
 int isspace(int x)
 {
     if (x == ' ' || x == '\t' || x == '\n' || x == '\v' || x == '\f' || x == '\r')
         return 1;
     else
         return 0;
 }
   
 // isdigit - report whether x is one of the decimal digits '0'..'9'.
 // Returns 1 for a digit, otherwise 0.
 int isdigit(int x)
 {
     if (x >= '0' && x <= '9')
         return 1;
     else
         return 0;
 }
 
 

然后 atoi调用上面的atol函数，如下所示：

 
   //atoi调用上述的atol  
 /*
  * atoi - convert a string to int by delegating to atol().
  * The long result is narrowed with a cast; overflow is not detected.
  */
 int __cdecl atoi(
     const char *nptr
     )
 {
     long parsed = atol(nptr);

     return (int)parsed;
 }
 
 

但很遗憾的是，上述atoi实际依赖的atol在内部用long累加并返回long（在32位的VS环境下long同样只有32位），atoi只是把结果强制转换成int：

 
   long total; /* current total */  
 if (sign == '-')
     return -total;  
 else  
     return total; /* return result, negated if necessary */  
 
 

再者，下面这里定义成long的total与10相乘，即total*10很容易溢出：

 
   long total; /* current total */  
 total = 10 * total + (c - '0'); /* accumulate digit */

从上看出，貌似微软api也有bug，当然这段代码是否是真实的微软api源码，还有待考究。

接下来，咱们来看看 linux内核中是如何实现此字符串转换为整数的问题的。

linux内核中提供了以下几个函数：

simple_strtol，把一个字符串转换为一个有符号长整数；
simple_strtoll，把一个字符串转换为一个有符号长长整数；
simple_strtoul，把一个字符串转换为一个无符号长整数；
simple_strtoull，把一个字符串转换为一个无符号长长整数

相关源码及分析如下。

首先，atoi调下面的strtol：

 
   //linux/lib/vsprintf.c  
 //Copyright (C) 1991, 1992  Linus Torvalds  
 //simple_strtol - convert a string to a signed long  
 /*
  * Parse a signed long from cp. A single leading '-' is stripped and the
  * value returned by simple_strtoul() for the rest of the string is
  * negated; any other input is handed to simple_strtoul() unchanged.
  * endp and base are forwarded as given.
  */
 long simple_strtol(const char *cp, char **endp, unsigned int base)
 {
     if (*cp != '-')
         return simple_strtoul(cp, endp, base);

     /* skip the sign and negate the unsigned result */
     return -simple_strtoul(cp + 1, endp, base);
 }
 EXPORT_SYMBOL(simple_strtol);  
 
 

然后，上面的strtol调下面的strtoul：

 
   //simple_strtoul - convert a string to an unsigned long  
 /*
  * Parse an unsigned long from cp by delegating to simple_strtoull();
  * the unsigned long long result is narrowed to unsigned long.
  */
 unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
 {
     unsigned long long wide = simple_strtoull(cp, endp, base);

     return wide;
 }
 EXPORT_SYMBOL(simple_strtoul);  
 
 

接着，上面的strtoul会调用strtoull（其源码见后文）；这里先看有符号版本simple_strtoll，它同样是对strtoull的包装：

 
   //simple_strtoll - convert a string to a signed long long  
 /*
  * Parse a signed long long from cp. A single leading '-' is stripped and
  * the value returned by simple_strtoull() for the rest of the string is
  * negated; any other input is handed to simple_strtoull() unchanged.
  * endp and base are forwarded as given.
  */
 long long simple_strtoll(const char *cp, char **endp, unsigned int base)
 {
     if (*cp != '-')
         return simple_strtoull(cp, endp, base);

     /* skip the sign and negate the unsigned result */
     return -simple_strtoull(cp + 1, endp, base);
 }
 EXPORT_SYMBOL(simple_strtoll);  
 
 

最后，strtoull调_parse_integer_fixup_radix和_parse_integer来处理相关逻辑：

 
   //simple_strtoull - convert a string to an unsigned long long  
 unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)  
 {  
     unsigned long long result;  
     unsigned int rv;  
   
     cp = _parse_integer_fixup_radix(cp, &base);  
     rv = _parse_integer(cp, base, &result);  
     /* FIXME */  
     cp += (rv & ~KSTRTOX_OVERFLOW);  
   
     if (endp)  
         *endp = (char *)cp;  
   
     return result;  
 }  
 EXPORT_SYMBOL(simple_strtoull);  
 
 

重头戏来了。接下来，我们来看上面strtoull函数中的_parse_integer_fixup_radix和_parse_integer两段代码。如鲨鱼所说：

“真正的处理逻辑主要是在_parse_integer里面，关于溢出的处理，_parse_integer处理的很优美，
而_parse_integer_fixup_radix是用来自动根据字符串判断进制的”。

先来看 _parse_integer函数：

 
   //lib/kstrtox.c, line 39    
 //Convert non-negative integer string representation in explicitly given radix to an integer.    
 //Return number of characters consumed maybe or-ed with overflow bit.    
 //If overflow occurs, result integer (incorrect) is still returned.    
 unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)    
 {    
     unsigned long long res;    
     unsigned int rv;    
     int overflow;    
     
     res = 0;    
     rv = 0;    
     overflow = 0;    
     while (*s) {    
         unsigned int val;    
     
         if ('0' <= *s && *s <= '9')    
             val = *s - '0';    
         else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')    
             val = _tolower(*s) - 'a' + 10;    
         else    
             break;    
     
         if (val >= base)    
             break;    
         /*  
          * Check for overflow only if we are within range of  
          * it in the max base we support (16)  
          */    
         if (unlikely(res & (~0ull << 60))) {    
             if (res > div_u64(ULLONG_MAX - val, base))    
                 overflow = 1;    
         }    
         res = res * base + val;    
         rv++;    
         s++;    
     }    
     *p = res;    
     if (overflow)    
         rv |= KSTRTOX_OVERFLOW;    
     return rv;    
 }  
 
 

解释下两个小细节：

上头出现了个unlikely，其实unlikely和likely经常出现在linux相关内核源码中
if (likely(value)) {
    // 逻辑上等价于 if (value)
} else {
}
likely表示value为真的可能性更大，而unlikely表示value为假的可能性更大，这两个宏被定义成：
//include/linux/compiler.h
# ifndef likely
# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
# endif
呈现下div_u64的代码：

 
  //include/linux/math64.h  
 //div_u64  
 /*
  * Divide dividend by divisor and return the quotient. The remainder
  * produced by div_u64_rem() is discarded.
  */
 static inline u64 div_u64(u64 dividend, u32 divisor)
 {
     u32 unused_rem;
     u64 quotient = div_u64_rem(dividend, divisor, &unused_rem);

     return quotient;
 }
   
 //div_u64_rem  
 /*
  * Divide dividend by divisor: the quotient is returned and the
  * remainder is stored through the remainder pointer.
  */
 static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 {
     u64 quotient = dividend / divisor;

     *remainder = dividend % divisor;
     return quotient;
 }
 

最后看下_parse_integer_fixup_radix函数：

 
   //lib/kstrtox.c, line 23  
 /*
  * Settle the radix for s and skip a hexadecimal prefix.
  *
  * When *base is 0 it is chosen from the string: a leading "0x"/"0X"
  * followed by a hex digit selects 16, any other leading '0' selects 8,
  * and everything else selects 10. If the resulting base is 16 and s
  * starts with "0x"/"0X", the returned pointer is advanced past those two
  * characters; otherwise s is returned unchanged.
  */
 const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
 {
     if (*base == 0) {
         if (s[0] != '0')
             *base = 10;
         else if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
             *base = 16;
         else
             *base = 8;
     }

     /* nothing to skip unless this is base 16 with a 0x prefix */
     if (*base != 16 || s[0] != '0' || _tolower(s[1]) != 'x')
         return s;

     return s + 2;
 }