Fast bit count问题（即计算一个unsigned int的二进制表达中1的数目）

最新推荐文章于 2024-02-18 08:30:26 发布

天天向上的金牛

最新推荐文章于 2024-02-18 08:30:26 发布

阅读量8.8k

点赞数

分类专栏：算法 C、C++ 文章标签： parallel each casting c numbers c++

C、C++ 同时被 2 个专栏收录

17 篇文章

订阅专栏

算法

13 篇文章

订阅专栏

最近在看《Programming Pearls》。里面的好些问题很有意思。做一点小小的总结吧。

这个问题是计算一个unsigned int型二进制数中一个的个数，google中看到了不少很巧妙的算法，在这里做一个简单总结。另外，很喜欢stackoverflow这个论坛，geek很多，打酱油的人很少，感觉国外的程序员遇到有趣的问题兴致更高。关于这个问题，帖子见http://stackoverflow.com/questions/109023/best-algorithm-to-count-the-number-of-set-bits-in-a-32-bit-integer。

直观的想法

最简单最直观的想法是检测每一位是不是1，如果是的话就计数器加1，最后肯定能得出结果来。虽然"too simple, sometimes naive"，但是一般情况下是很好用的，需要的时候不用到处查，很快就能写出来。

int NumberOfSetBits(unsigned int n)
{
        int c = 0;
        for(int j = 0; j <32; j++)
        {
                if(n & (1<<j))
                {
                        c++;
                }
        }
        return c;
}

下面的程序思路一样，也是每位检测：

unsigned int bitCount (unsigned int value) {
    unsigned int count = 0;
    while (value > 0) {           // until all bits are zero
        if ((value & 1) == 1) {   // check lower bit
            count++;
        }
        value /= 2;               // shift bits, removing lower bit
    }
    return count;
}

进阶

下面的这种方法比较巧妙，已经比较难想到了。可以动手试试n-1的二进制表达就明白了。

long count_bits(long n) {     
  unsigned int c; // c accumulates the total bits set in v
  for (c = 0; n; c++) 
    n &= n - 1; // clear the least significant bit set
  return c;
}

高级

这里面的方法属于那种“此曲只应天上有”的类型，我都没有尝试去理解，用的时候查一下就好了。

int pop(unsigned x)
{
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = x + (x >> 8);
    x = x + (x >> 16);
    return x & 0x0000003F;
}

最懒的方法

下面的程序只有一句话，调用系统库函数 __builtin_popcount，我在gcc下编译运行是可以的。

int __builtin_popcount (unsigned int x);

转自：http://blog.youkuaiyun.com/thestoryofsnow/article/details/6565875

/* ==========================================================================
   Bit Counting routines

   Author: Gurmeet Singh Manku    (manku@cs.stanford.edu)
   Date:   27 Aug 2002
   ========================================================================== */


#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/* Iterated bitcount iterates over each bit. The while condition sometimes helps
   terminates the loop earlier */
int iterated_bitcount (unsigned int n)
{
    int count=0;    
    while (n)
    {
        count += n & 0x1u ;    
        n >>= 1 ;
    }
    return count ;
}


/* Sparse Ones runs proportional to the number of ones in n.
   The line   n &= (n-1)   simply sets the last 1 bit in n to zero. */
int sparse_ones_bitcount (unsigned int n)
{
    int count=0 ;
    while (n)
    {
        count++ ;
        n &= (n - 1) ;     
    }
    return count ;
}


/* Dense Ones runs proportional to the number of zeros in n.
   It first toggles all bits in n, then diminishes count repeatedly */
int dense_ones_bitcount (unsigned int n)
{
    int count = 8 * sizeof(int) ;   
    n ^= (unsigned int) -1 ;
    while (n)
    {
        count-- ;
        n &= (n - 1) ;    
    }
    return count ;
}


/* Precomputed bitcount uses a precomputed array that stores the number of ones
   in each char. */
static int bits_in_char [256] ;

void compute_bits_in_char (void)
{
    unsigned int i ;    
    for (i = 0; i < 256; i++)
        bits_in_char [i] = iterated_bitcount (i) ;
    return ;
}

int precomputed_bitcount (unsigned int n)
{
    // works only for 32-bit ints
    
    return bits_in_char [n         & 0xffu]
        +  bits_in_char [(n >>  8) & 0xffu]
        +  bits_in_char [(n >> 16) & 0xffu]
        +  bits_in_char [(n >> 24) & 0xffu] ;
}


/* Here is another version of precomputed bitcount that uses a precomputed array
   that stores the number of ones in each short. */

static char bits_in_16bits [0x1u << 16] ;

void compute_bits_in_16bits (void)
{
    unsigned int i ;    
    for (i = 0; i < (0x1u<<16); i++)
        bits_in_16bits [i] = iterated_bitcount (i) ;
    return ;
}

int precomputed16_bitcount (unsigned int n)
{
    // works only for 32-bit int
    
    return bits_in_16bits [n         & 0xffffu]
        +  bits_in_16bits [(n >> 16) & 0xffffu] ;
}


/* Parallel   Count   carries   out    bit   counting   in   a   parallel
   fashion.   Consider   n   after    the   first   line   has   finished
   executing. Imagine splitting n into  pairs of bits. Each pair contains
   the <em>number of ones</em> in those two bit positions in the original
   n.  After the second line has finished executing, each nibble contains
   the  <em>number of  ones</em>  in  those four  bits  positions in  the
   original n. Continuing  this for five iterations, the  64 bits contain
   the  number  of ones  among  these  sixty-four  bit positions  in  the
   original n. That is what we wanted to compute. */

#define TWO(c) (0x1u << (c))
#define MASK(c) (((unsigned int)(-1)) / (TWO(TWO(c)) + 1u))
#define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (TWO(c))) & MASK(c))

int parallel_bitcount (unsigned int n)
{
    n = COUNT(n, 0) ;
    n = COUNT(n, 1) ;
    n = COUNT(n, 2) ;
    n = COUNT(n, 3) ;
    n = COUNT(n, 4) ;
    /* n = COUNT(n, 5) ;    for 64-bit integers */
    return n ;
}


/* Nifty  Parallel Count works  the same  way as  Parallel Count  for the
   first three iterations. At the end  of the third line (just before the
   return), each byte of n contains the number of ones in those eight bit
   positions in  the original n. A  little thought then  explains why the
   remainder modulo 255 works. */

#define MASK_01010101 (((unsigned int)(-1))/3)
#define MASK_00110011 (((unsigned int)(-1))/5)
#define MASK_00001111 (((unsigned int)(-1))/17)

int nifty_bitcount (unsigned int n)
{
    n = (n & MASK_01010101) + ((n >> 1) & MASK_01010101) ;
    n = (n & MASK_00110011) + ((n >> 2) & MASK_00110011) ;
    n = (n & MASK_00001111) + ((n >> 4) & MASK_00001111) ;        
    return n % 255 ;
}

/* MIT Bitcount

   Consider a 3 bit number as being
        4a+2b+c
   if we shift it right 1 bit, we have
        2a+b
  subtracting this from the original gives
        2a+b+c
  if we shift the original 2 bits right we get
        a
  and so with another subtraction we have
        a+b+c
  which is the number of bits in the original number.

  Suitable masking  allows the sums of  the octal digits  in a 32 bit  number to
  appear in  each octal digit.  This  isn't much help  unless we can get  all of
  them summed together.   This can be done by modulo  arithmetic (sum the digits
  in a number by  molulo the base of the number minus  one) the old "casting out
  nines" trick  they taught  in school before  calculators were  invented.  Now,
  using mod 7 wont help us, because our number will very likely have more than 7
  bits set.   So add  the octal digits  together to  get base64 digits,  and use
  modulo 63.   (Those of you  with 64  bit machines need  to add 3  octal digits
  together to get base512 digits, and use mod 511.)
 
  This is HACKMEM 169, as used in X11 sources.
  Source: MIT AI Lab memo, late 1970's.
*/

int mit_bitcount(unsigned int n)
{
    /* works for 32-bit numbers only */
    register unsigned int tmp;
    
    tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
    return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

void verify_bitcounts (unsigned int x)
{
    int iterated_ones, sparse_ones, dense_ones ;
    int precomputed_ones, precomputed16_ones ;
    int parallel_ones, nifty_ones ;
    int mit_ones ;
    
    iterated_ones      = iterated_bitcount      (x) ;
    sparse_ones        = sparse_ones_bitcount   (x) ;
    dense_ones         = dense_ones_bitcount    (x) ;
    precomputed_ones   = precomputed_bitcount   (x) ;
    precomputed16_ones = precomputed16_bitcount (x) ;
    parallel_ones      = parallel_bitcount      (x) ;
    nifty_ones         = nifty_bitcount         (x) ;
    mit_ones           = mit_bitcount           (x) ;

    if (iterated_ones != sparse_ones)
    {
        printf ("ERROR: sparse_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }
    
    if (iterated_ones != dense_ones)
    {
        printf ("ERROR: dense_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }

    if (iterated_ones != precomputed_ones)
    {
        printf ("ERROR: precomputed_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }
        
    if (iterated_ones != precomputed16_ones)
    {
        printf ("ERROR: precomputed16_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }
    
    if (iterated_ones != parallel_ones)
    {
        printf ("ERROR: parallel_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }

    if (iterated_ones != nifty_ones)
    {
        printf ("ERROR: nifty_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }

    if (mit_ones != nifty_ones)
    {
        printf ("ERROR: mit_bitcount (0x%x) not okay!/n", x) ;
        exit (0) ;
    }
    
    return ;
}


int main (void)
{
    int i ;
    
    compute_bits_in_char () ;
    compute_bits_in_16bits () ;

    verify_bitcounts (UINT_MAX) ;
    verify_bitcounts (0) ;
    
    for (i = 0 ; i < 100000 ; i++)
        verify_bitcounts (lrand48 ()) ;
    
    printf ("All bitcounts seem okay!/n") ;
    
    return 0 ;
}

转自：http://blog.youkuaiyun.com/ehui928/article/details/1011309

Counting bits set (naive way)

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v

for (c = 0; v; v >>= 1)
{
  c += v & 1;
}

The naive approach requires one iteration per bit, until no more bits are set. So on a 32-bit word with only the high set, it will go through 32 iterations.

Counting bits set by lookup table

static const unsigned char BitsSetTable256[256] = 
{
#   define B2(n) n,     n+1,     n+1,     n+2
#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
    B6(0), B6(1), B6(1), B6(2)
};

unsigned int v; // count the number of bits set in 32-bit value v
unsigned int c; // c is the total bits set in v

// Option 1:
c = BitsSetTable256[v & 0xff] + 
    BitsSetTable256[(v >> 8) & 0xff] + 
    BitsSetTable256[(v >> 16) & 0xff] + 
    BitsSetTable256[v >> 24]; 

// Option 2:
unsigned char * p = (unsigned char *) &v;
c = BitsSetTable256[p[0]] + 
    BitsSetTable256[p[1]] + 
    BitsSetTable256[p[2]] +	
    BitsSetTable256[p[3]];


// To initially generate the table algorithmically:
BitsSetTable256[0] = 0;
for (int i = 0; i < 256; i++)
{
  BitsSetTable256[i] = (i & 1) + BitsSetTable256[i / 2];
}

On July 14, 2009 Hallvard Furuseth suggested the macro compacted table.

Counting bits set, Brian Kernighan's way

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
for (c = 0; v; c++)
{
  v &= v - 1; // clear the least significant bit set
}

Brian Kernighan's method goes through as many iterations as there are set bits. So if we have a 32-bit word with only the high bit set, then it will only go once through the loop.

Published in 1988, the C Programming Language 2nd Ed. (by Brian W. Kernighan and Dennis M. Ritchie) mentions this in exercise 2-9. On April 19, 2006 Don Knuth pointed out to me that this method "was first published by Peter Wegner in CACM 3 (1960), 322. (Also discovered independently by Derrick Lehmer and published in 1964 in a book edited by Beckenbach.)"

Counting bits set in 14, 24, or 32-bit words using 64-bit instructions

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v

// option 1, for at most 14-bit values in v:
c = (v * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;

// option 2, for at most 24-bit values in v:
c =  ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) 
     % 0x1f;

// option 3, for at most 32-bit values in v:
c =  ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) % 
     0x1f;
c += ((v >> 24) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;

This method requires a 64-bit CPU with fast modulus division to be efficient. The first option takes only 3 operations; the second option takes 10; and the third option takes 15.

Rich Schroeppel originally created a 9-bit version, similiar to option 1; see the Programming Hacks section of Beeler, M., Gosper, R. W., and Schroeppel, R. HAKMEM. MIT AI Memo 239, Feb. 29, 1972. His method was the inspiration for the variants above, devised by Sean Anderson. Randal E. Bryant offered a couple bug fixes on May 3, 2005. Bruce Dawson tweaked what had been a 12-bit version and made it suitable for 14 bits using the same number of operations on Feburary 1, 2007.

Counting bits set, in parallel

unsigned int v; // count bits set in this (32-bit value)
unsigned int c; // store the total here
static const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
static const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

c = v - ((v >> 1) & B[0]);
c = ((c >> S[1]) & B[1]) + (c & B[1]);
c = ((c >> S[2]) + c) & B[2];
c = ((c >> S[3]) + c) & B[3];
c = ((c >> S[4]) + c) & B[4];

The B array, expressed as binary, is:

B[0] = 0x55555555 = 01010101 01010101 01010101 01010101
B[1] = 0x33333333 = 00110011 00110011 00110011 00110011
B[2] = 0x0F0F0F0F = 00001111 00001111 00001111 00001111
B[3] = 0x00FF00FF = 00000000 11111111 00000000 11111111
B[4] = 0x0000FFFF = 00000000 00000000 11111111 11111111

We can adjust the method for larger integer sizes by continuing with the patterns for the Binary Magic Numbers, B and S. If there are k bits, then we need the arrays S and B to be ceil(lg(k)) elements long, and we must compute the same number of expressions for c as S or B are long. For a 32-bit v, 16 operations are used.

The best method for counting bits in a 32-bit integer v is the following:

v = v - ((v >> 1) & 0x55555555);                    // reuse input as temporary
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     // temp
c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count

The best bit counting method takes only 12 operations, which is the same as the lookup-table method, but avoids the memory and potential cache misses of a table. It is a hybrid between the purely parallel method above and the earlier methods using multiplies (in the section on counting bits with 64-bit instructions), though it doesn't use 64-bit instructions. The counts of bits set in the bytes is done in parallel, and the sum total of the bits set in the bytes is computed by multiplying by 0x1010101 and shifting right 24 bits.

A generalization of the best bit counting method to integers of bit-widths upto 128 (parameterized by type T) is this:

v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count

See Ian Ashdown's nice newsgroup post for more information on counting the number of bits set (also known as sideways addition). The best bit counting method was brought to my attention on October 5, 2005 by Andrew Shapira; he found it in pages 187-188 of Software Optimization Guide for AMD Athlon™ 64 and Opteron™ Processors. Charlie Gordon suggested a way to shave off one operation from the purely parallel version on December 14, 2005, and Don Clugston trimmed three more from it on December 30, 2005. I made a typo with Don's suggestion that Eric Cole spotted on January 8, 2006. Eric later suggested the arbitrary bit-width generalization to the best method on November 17, 2006. On April 5, 2007, Al Williams observed that I had a line of dead code at the top of the first method.

转自： http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive

不得不转，这个太经典了。后面两个问题自己分析了一下。

转自：http://blog.chinaunix.net/u/13991/showart_115947.html

代码：http://infolab.stanford.edu/~manku/bitcount/bitcount.c

Fast Bit Counting Routines

Compiled from various sources by Gurmeet Singh Manku

A common problem asked in job interviews is to count the number of bits that are on in an unsigned integer. Here are seven solutions to this problem. Source code in C is available.

Iterated Count

   int bitcount (unsigned int n)  
   {
      int count=0;    
      while (n)
      {
         count += n & 0x1u ;
         n >>= 1 ;
      }
      return count ;
   }

Sparse Ones

   int bitcount (unsigned int n)  
   {  
      int count=0 ;
      while (n)
      {
         count++ ;
         n &= (n - 1) ;
      }
      return count ;
   }

Dense Ones

   int bitcount (unsigned int n)     
   {
      int count = 8 * sizeof(int) ;
      n ^= (unsigned int) -1 ;
      while (n)
      {
         count-- ;
         n &= (n - 1) ;
      }
      return count ;
   }

Precompute_8bit

   // static int bits_in_char [256] ;           
   
   int bitcount (unsigned int n)
   {
      // works only for 32-bit ints
    
      return bits_in_char [n         & 0xffu]
          +  bits_in_char [(n >>  8) & 0xffu]
          +  bits_in_char [(n >> 16) & 0xffu]
          +  bits_in_char [(n >> 24) & 0xffu] ;
   }

Iterated Count runs in time proportional to the total number of bits. It simply loops through all the bits, terminating slightly earlier because of the while condition. Useful if 1's are sparse and among the least significant bits. Sparse Ones runs in time proportional to the number of 1 bits. The line n &= (n - 1) simply sets the rightmost 1 bit in n to 0. Dense Ones runs in time proportional to the number of 0 bits. It is the same as Sparse Ones, except that it first toggles all bits (n ~= -1), and continually subtracts the number of 1 bits from sizeof(int). Precompute_8bit assumes an array bits_in_char such that bits_in_char[i] contains the number of 1 bits in the binary representation for i. It repeatedly updates count by masking out the last eight bits in n, and indexing into bits_in_char.

Precompute_16bit

  // static char bits_in_16bits [0x1u << 16] ;      

  int bitcount (unsigned int n)
  {
     // works only for 32-bit ints
    
     return bits_in_16bits [n         & 0xffffu]
         +  bits_in_16bits [(n >> 16) & 0xffffu] ;
  }

Precompute_16bit is a variant of Precompute_8bit in that an array bits_in_16bits[] stores the number of 1 bits in successive 16 bit numbers (shorts).

Parallel Count

  #define TWO(c)     (0x1u << (c))
  #define MASK(c)    (((unsigned int)(-1)) / (TWO(TWO(c)) + 1u))
  #define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (TWO(c))) & MASK(c))
     
  int bitcount (unsigned int n)
  {
     n = COUNT(n, 0) ;
     n = COUNT(n, 1) ;
     n = COUNT(n, 2) ;
     n = COUNT(n, 3) ;
     n = COUNT(n, 4) ;
     /* n = COUNT(n, 5) ;    for 64-bit integers */
     return n ;
  }

Parallel Count carries out bit counting in a parallel fashion. Consider n after the first line has finished executing. Imagine splitting n into pairs of bits. Each pair contains thenumber of ones in those two bit positions in the original n. After the second line has finished executing, each nibble contains thenumber of ones in those four bits positions in the original n. Continuing this for five iterations, the 64 bits contain the number of ones among these sixty-four bit positions in the original n. That is what we wanted to compute.

Nifty Parallel Count

 #define MASK_01010101 (((unsigned int)(-1))/3)
 #define MASK_00110011 (((unsigned int)(-1))/5)
 #define MASK_00001111 (((unsigned int)(-1))/17)

 int bitcount (unsigned int n)
 {
    n = (n & MASK_01010101) + ((n >> 1) & MASK_01010101) ; 
    n = (n & MASK_00110011) + ((n >> 2) & MASK_00110011) ; 
    n = (n & MASK_00001111) + ((n >> 4) & MASK_00001111) ; 
    return n % 255 ;
 }

Nifty Parallel Countworks the same way as Parallel Count for the first three iterations. At the end of the third line (just before the return), each byte of n contains the number of ones in those eight bit positions in the original n. A little thought then explains why the remainder modulo 255 works.

MIT HACKMEM Count

 int bitcount(unsigned int n)                          
 {
    /* works for 32-bit numbers only    */
    /* fix last line for 64-bit numbers */
    
    register unsigned int tmp;
    
    tmp = n - ((n >> 1) & 033333333333)
            - ((n >> 2) & 011111111111);
    return ((tmp + (tmp >> 3)) & 030707070707) % 63;
 }

MIT Hackmem Count is funky. Consider a 3 bit number as being 4a+2b+c. If we shift it right 1 bit, we have 2a+b. Subtracting this from the original gives 2a+b+c. If we right-shift the original 3-bit number by two bits, we get a, and so with another subtraction we have a+b+c, which is the number of bits in the original number. How is this insight employed? The first assignment statement in the routine computes tmp. Consider the octal representation oftmp. Each digit in the octal representation is simply the number of 1's in the corresponding three bit positions in n. The last return statement sums these octal digits to produce the final answer. The key idea is to add adjacent pairs of octal digits together and then compute the remainder modulus 63. This is accomplished by right-shifting tmp by three bits, adding it to tmp itself and ANDing with a suitable mask. This yields a number in which groups of six adjacent bits (starting from the LSB) contain the number of 1's among those six positions inn. This number modulo 63 yields the final answer. For 64-bit numbers, we would have to add triples of octal digits and use modulus 1023. This is HACKMEM 169, as used in X11 sources. Source: MIT AI Lab memo, late 1970's.

     No Optimization         Some Optimization       Heavy Optimization

  Precomp_16 52.94 Mcps    Precomp_16 76.22 Mcps    Precomp_16 80.58 Mcps  
   Precomp_8 29.74 Mcps     Precomp_8 49.83 Mcps     Precomp_8 51.65 Mcps
    Parallel 19.30 Mcps      Parallel 36.00 Mcps      Parallel 38.55 Mcps
         MIT 16.93 Mcps           MIT 17.10 Mcps         Nifty 31.82 Mcps
       Nifty 12.78 Mcps         Nifty 16.07 Mcps           MIT 29.71 Mcps
      Sparse  5.70 Mcps        Sparse 15.01 Mcps        Sparse 14.62 Mcps
       Dense  5.30 Mcps        Dense  14.11 Mcps         Dense 14.56 Mcps
    Iterated  3.60 Mcps      Iterated  3.84 Mcps      Iterated  9.24 Mcps

  Mcps = Million counts per second

Which of the several bit counting routines is the fastest? Results of speed trials on an i686 are summarized in the table on left. "No Optimization" was compiled with plain gcc. "Some Optimizations" was gcc -O3. "Heavy Optimizations" corresponds to gcc -O3 -mcpu=i686 -march=i686 -fforce-addr -funroll-loops -frerun-cse-after-loop -frerun-loop-opt -malign-functions=4.
Thanks to Seth Robertson who suggested performing speed trials by extending bitcount.c. Seth also pointed me to MIT_Hackmem routine. Thanks to Denny Gursky who suggested the idea of Precompute_11bit. That would require three sums (11-bit, 11-bit and 10-bit precomputed counts). I then tried Precompute_16bit which turned out to be even faster.

If you have niftier solutions up your sleeves, please send me an e-mail

转自：http://idning.iteye.com/blog/732769