hash函数
hash函数常用的是对素数取模(mod 素数),或者使用乘法策略并取结果中的某些位。这些策略直接影响到hash table的resize方式:如果是mod素数,表的大小只能按素数来递增;如果是乘法方法,只能按2^p来递增。
参考文献[1][2]等给出了很多,常用的字符串hash函数,但更有价值的还包括下面这两个:
murmur hash[3][4],其中[3]无法直接访问,不过本文的例子中给出了具体的实现,更多关于murmur hash的介绍见[8]。
city hash见参考文献[5]
hash table的构建
hash table的构建通常使用链表(list)来解决冲突问题,c++ tr1中就是这样实现的。tr1的实现使用了基于策略(policy-based)的设计方法,详情见参考文献[6],目前还没有仔细看。这个参考文献是在浏览源码(/usr/include/c++/4.4/tr1_impl/hashtable)时,在头部注释中发现的。
resize问题
resize的策略包括:
策略1:
1)全部重新copy一遍,
策略2:
1)resize时,使用2个hash table,当insert时,只向新的hash table中insert,同时将old hash table中的r个数据放入新的table;查找时,2个hash table同时查询
2)如果old hash table数据已经全部移动完毕,删除old hash table
上面两个策略的详细信息见参考文献[7]
bloomFilter
说到hash需要提一下bloomfilter,它通过hash实现,好的hash函数可以使bloomfilter具有很好的性能。它的一个使用方法,就是如果要查询数据库,可以在数据库前加一个bloomfilter,如果数据不在bloomfilter中,就不用查询数据库了,因为bloomfilter返回false时是不会有错误的。用cityhash或murmurhash来实现bloomfilter应该是非常好的选择。
consistent hash(一致性hash)
C++中的hashtable
本文的计算性能比较
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <map>
#include <string>
#include <tr1/unordered_map>
#include <vector>

#include "basictypes.h"
#include "cityhash/include/city.h"
// 64-bit hash for 64-bit platforms (MurmurHash64A, defined below).
// kFingerPrintSeed is the seed value this file passes to the MurmurHash
// functions when benchmarking them in main().
const uint32 kFingerPrintSeed = 19820125;
/**
 * MurmurHash64A: hashes a byte buffer to a 64-bit value.
 *
 * The buffer is consumed eight bytes at a time, the remaining 0-7 tail bytes
 * are folded in, and the state is then mixed a final time.
 *
 * Each 8-byte word is loaded with memcpy, so `key` does not need to be
 * 8-byte aligned and no object is accessed through an incompatible pointer
 * type. Words are interpreted in host byte order.
 *
 * @param key   first byte of the data to hash.
 * @param len   number of bytes to read from `key`.
 * @param seed  value mixed into the initial hash state.
 * @return the 64-bit hash of the buffer.
 */
uint64 MurmurHash64A(const void* key, int len, uint32 seed) {
  const uint64 m = 0xc6a4a7935bd1e995;
  const int r = 47;
  uint64 h = seed ^ (len * m);

  const unsigned char* cursor = static_cast<const unsigned char*>(key);
  const unsigned char* const body_end = cursor + (len / 8) * sizeof(uint64);
  while (cursor != body_end) {
    uint64 k;
    memcpy(&k, cursor, sizeof(k));  // alignment-safe load of one word
    cursor += sizeof(k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Fold in the trailing bytes; every case deliberately falls through to the
  // next so that all remaining bytes are mixed in before the multiply.
  const uint8* data2 = reinterpret_cast<const uint8*>(cursor);
  switch (len & 7) {
    case 7: h ^= static_cast<uint64>(data2[6]) << 48;  // fall through
    case 6: h ^= static_cast<uint64>(data2[5]) << 40;  // fall through
    case 5: h ^= static_cast<uint64>(data2[4]) << 32;  // fall through
    case 4: h ^= static_cast<uint64>(data2[3]) << 24;  // fall through
    case 3: h ^= static_cast<uint64>(data2[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint64>(data2[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint64>(data2[0]);
            h *= m;
  };

  // Final mixing steps.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;
  return h;
}
// 32-bit hash
/**
 * MurmurHash32A: hashes a byte buffer to a 32-bit value.
 *
 * The buffer is consumed four bytes at a time, the remaining 0-3 tail bytes
 * are folded in, and the state is then mixed a final time.
 *
 * Each 4-byte word is loaded with memcpy, so `key` does not need to be
 * 4-byte aligned and constness of the input is never cast away. Words are
 * interpreted in host byte order.
 *
 * @param key   first byte of the data to hash.
 * @param len   number of bytes to read from `key`.
 * @param seed  value mixed into the initial hash state.
 * @return the 32-bit hash of the buffer.
 */
uint32 MurmurHash32A(const void* key, int len, uint32 seed) {
  const uint32 m = 0x5bd1e995;
  const int r = 24;
  uint32 h = seed ^ (len * m);

  const unsigned char* cursor = static_cast<const unsigned char*>(key);
  while (len >= 4) {
    uint32 k;
    memcpy(&k, cursor, sizeof(k));  // alignment-safe load of one word

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    cursor += sizeof(k);
    len -= 4;
  }

  // Handle the last few bytes of the input array; every case deliberately
  // falls through to the next one.
  const uint8* data2 = reinterpret_cast<const uint8*>(cursor);
  switch (len) {
    case 3: h ^= static_cast<uint32>(data2[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint32>(data2[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint32>(data2[0]);
            h *= m;
  };

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}
/* A Simple Hash Function */
/**
 * Multiplicative string hash: hash = 31 * hash + byte, for every byte of the
 * NUL-terminated string. Bytes are read as unsigned char.
 *
 * The `register` storage class used previously was removed from the language
 * in C++17, so the locals are now ordinary variables.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int simple_hash(char *str)
{
  unsigned int hash = 0;
  for (const unsigned char *p = reinterpret_cast<const unsigned char *>(str);
       *p != 0; ++p) {
    hash = 31 * hash + *p;
  }
  return (hash & 0x7FFFFFFF);
}
/* RS Hash Function */
/**
 * Hashes a NUL-terminated string with a multiplier that itself changes on
 * every character: the accumulator is multiplied by the current multiplier
 * and the character is added, then the multiplier is scaled by a constant.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int RS_hash(char *str)
{
  const unsigned int kMultiplierStep = 378551;
  unsigned int multiplier = 63689;
  unsigned int acc = 0;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    acc = acc * multiplier + *cursor;
    multiplier *= kMultiplierStep;
  }

  return acc & 0x7FFFFFFF;
}
/* JS Hash Function */
/**
 * Hashes a NUL-terminated string by XOR-ing the accumulator, for each
 * character, with (acc << 5) + character + (acc >> 2).
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int JS_hash(char *str)
{
  unsigned int acc = 1315423911;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    const unsigned int mix = (acc << 5) + *cursor + (acc >> 2);
    acc ^= mix;
  }

  return acc & 0x7FFFFFFF;
}
/* P. J. Weinberger Hash Function */
/**
 * Hashes a NUL-terminated string by shifting the accumulator left by one
 * eighth of the width of unsigned int and adding each character. Whenever
 * any of the top bits become set, they are shifted down by three quarters of
 * the width, XOR-ed back in, and then cleared.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int PJW_hash(char *str)
{
  const unsigned int kWordBits =
      static_cast<unsigned int>(sizeof(unsigned int) * 8);
  const unsigned int kFoldShift = (kWordBits * 3) / 4;
  const unsigned int kStepShift = kWordBits / 8;
  const unsigned int kTopMask =
      static_cast<unsigned int>(0xFFFFFFFF) << (kWordBits - kStepShift);

  unsigned int acc = 0;
  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    acc = (acc << kStepShift) + *cursor;

    const unsigned int overflow = acc & kTopMask;
    if (overflow != 0) {
      acc = (acc ^ (overflow >> kFoldShift)) & ~kTopMask;
    }
  }

  return acc & 0x7FFFFFFF;
}
/* ELF Hash Function */
/**
 * Hashes a NUL-terminated string by shifting the accumulator left four bits
 * and adding each character. When any of the top four bits become set they
 * are XOR-ed back in 24 bits lower and then cleared.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int ELF_hash(char *str)
{
  unsigned int acc = 0;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    acc = (acc << 4) + *cursor;

    const unsigned int top = acc & 0xF0000000L;
    if (top == 0) {
      continue;
    }
    acc ^= (top >> 24);
    acc &= ~top;
  }

  return acc & 0x7FFFFFFF;
}
/* BKDR Hash Function */
/**
 * Multiplicative string hash: acc = acc * 131 + character, for every
 * character of the NUL-terminated string.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int BKDR_hash(char *str)
{
  // Other multipliers of the same family: 31 131 1313 13131 131313 etc..
  const unsigned int kMultiplier = 131;
  unsigned int acc = 0;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    acc = acc * kMultiplier + *cursor;
  }

  return acc & 0x7FFFFFFF;
}
/* SDBM Hash Function */
/**
 * Hashes a NUL-terminated string with
 * acc = character + (acc << 6) + (acc << 16) - acc for every character.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int SDBM_hash(char *str)
{
  unsigned int acc = 0;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    const unsigned int spread = (acc << 6) + (acc << 16) - acc;
    acc = spread + *cursor;
  }

  return acc & 0x7FFFFFFF;
}
/* DJB Hash Function */
/**
 * Hashes a NUL-terminated string starting from 5381 and, for every
 * character, adding (acc << 5) + character to the accumulator.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int DJB_hash(char *str)
{
  unsigned int acc = 5381;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    acc = ((acc << 5) + acc) + *cursor;
  }

  return acc & 0x7FFFFFFF;
}
/* AP Hash Function */
/**
 * Hashes a NUL-terminated string by alternating between two mixing steps:
 * characters at even positions (0, 2, ...) use shifts of 7 and 3, characters
 * at odd positions use shifts of 11 and 5 with the result bit-inverted.
 *
 * @param str  NUL-terminated string to hash; it is only read.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF).
 */
unsigned int AP_hash(char *str)
{
  unsigned int acc = 0;
  bool even_position = true;

  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    const char c = *cursor;
    if (even_position) {
      acc ^= ((acc << 7) ^ c ^ (acc >> 3));
    } else {
      acc ^= (~((acc << 11) ^ c ^ (acc >> 5)));
    }
    even_position = !even_position;
  }

  return acc & 0x7FFFFFFF;
}
/* CRC Hash Function */
/**
 * Computes a 16-bit one's-complement checksum of a NUL-terminated string:
 * the bytes are summed as 16-bit words, the carries are folded back into the
 * low 16 bits, and the result is bit-inverted. Despite the name, no CRC
 * polynomial is involved.
 *
 * Words are loaded with memcpy, so `str` does not need to be 2-byte aligned
 * and the characters are never accessed through an `unsigned short` pointer.
 * Words are interpreted in host byte order, so results for the same string
 * can differ between little- and big-endian machines.
 *
 * @param str  NUL-terminated string to checksum; it is only read.
 * @return the checksum, always in the range 0 .. 0xFFFF.
 */
unsigned int CRC_hash(char *str)
{
  size_t nleft = strlen(str);
  unsigned long long sum = 0;
  const char *cursor = str;
  unsigned short int answer = 0;

  /*
   * Using a 64 bit accumulator (sum), add sequential 16 bit words to it;
   * the carry bits are folded back into the lower 16 bits at the end.
   */
  while (nleft > 1) {
    unsigned short int word;
    memcpy(&word, cursor, sizeof(word));
    sum += word;
    cursor += sizeof(word);
    nleft -= 2;
  }

  /*
   * Mop up an odd byte, if necessary: it is placed in the first byte (in
   * memory order) of an otherwise zero 16 bit word.
   */
  if (nleft == 1) {
    memcpy(&answer, cursor, 1);
    sum += answer;
  }

  /* add hi 16 to low 16 */
  sum = (sum >> 16) + (sum & 0xFFFF);
  /* add carry */
  sum += (sum >> 16);
  /* invert and truncate to 16 bits */
  answer = static_cast<unsigned short int>(~sum);
  return answer;
}
/**
 * Formats the magnitude of `value` in base 36 using the digits 0-9a-z,
 * most significant digit first. The sign is dropped: Itoa(-36) and Itoa(36)
 * both return "10".
 *
 * Fixes over the previous version:
 *  - the base was sizeof(table) == 37, which counted the table's terminating
 *    NUL as a digit and could embed '\0' characters in the result;
 *  - digits were emitted least significant first;
 *  - negating INT_MIN overflowed; the magnitude is now computed in unsigned
 *    arithmetic.
 *
 * @param value  number to format.
 * @return the base-36 text of |value|; never empty.
 */
std::string Itoa(int value) {
  static const char kDigits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  const unsigned int kBase = sizeof(kDigits) - 1;  // exclude the trailing NUL

  unsigned int magnitude = static_cast<unsigned int>(value);
  if (value < 0) {
    magnitude = 0u - magnitude;  // well-defined even for INT_MIN
  }

  // One character per bit is more than enough room for any base >= 2.
  char buffer[sizeof(unsigned int) * 8];
  char* const buffer_end = buffer + sizeof(buffer);
  char* first = buffer_end;
  do {
    *--first = kDigits[magnitude % kBase];
    magnitude /= kBase;
  } while (magnitude > 0);

  return std::string(first, buffer_end);
}
/**
 * Returns the wall-clock time in microseconds elapsed since the first call
 * to GetTime() in this process (the first call itself returns 0).
 *
 * The previous version returned seconds-since-epoch * 1000000 + microseconds
 * squeezed into an int, which does not fit. Measuring from the first call
 * keeps the value small, so `GetTime() - start` is a correct interval for
 * runs shorter than roughly 35 minutes (INT_MAX microseconds).
 *
 * The origin is kept in function-local statics without synchronization, so
 * the first call must not race with other calls.
 *
 * @return elapsed microseconds since the first call.
 */
int GetTime() {
  static bool has_origin = false;
  static timeval origin;

  timeval now;
  gettimeofday(&now, NULL);
  if (!has_origin) {
    origin = now;
    has_origin = true;
  }

  const long long elapsed_us =
      static_cast<long long>(now.tv_sec - origin.tv_sec) * 1000000LL +
      static_cast<long long>(now.tv_usec - origin.tv_usec);
  return static_cast<int>(elapsed_us);
}
// Hash functor for std::string keys: forwards the string's bytes and length
// to CityHash64. Used as the hasher of the unordered_map in main().
class StringHash {
 public:
  uint64 operator()(const std::string& s) const {
    const char* const bytes = s.c_str();
    const size_t length = s.size();
    // Alternative hasher that was tried here:
    // return MurmurHash64A(s.c_str(), s.size(), kFingerPrintSeed) % (unsigned int) 0xFFFFFFFF;
    return CityHash64(bytes, length);
  }
};
// Equality functor for std::string keys: two keys are equal when they have
// the same length and the same characters.
class StringEqual {
 public:
  bool operator()(const std::string& left, const std::string& right) const {
    const bool same = (left.compare(right) == 0);
    return same;
  }
};
int main(int argc, char** argv) {
const int kDataSize = 1000000;
std::string content = "";
std::vector<std::string> data;
for (int i = 0; i < kDataSize; ++i) {
content = "";
for (int j = 0; j < 10; ++j) {
content += Itoa(rand());
}
data.push_back(content);
}
//murmur test
int start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
MurmurHash64A(data[i].c_str(), data[i].size(), kFingerPrintSeed);
}
printf("murmur64: %d\n", GetTime() - start);
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
MurmurHash32A(data[i].c_str(), data[i].size(), kFingerPrintSeed);
}
printf("murmur32:%d\n", GetTime() - start);
//simple hash
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
simple_hash(const_cast<char*>(data[i].c_str()));
}
printf("simple hash:%d\n", GetTime() - start);
// bkdr hash
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
BKDR_hash(const_cast<char*>(data[i].c_str()));
}
printf("bkdr hash:%d\n", GetTime() - start);
// AP hash
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
AP_hash(const_cast<char*>(data[i].c_str()));
}
printf("AP hash:%d\n", GetTime() - start);
// City hash
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
CityHash64(data[i].c_str(), data[i].size());
}
printf("city hash:%d\n", GetTime() - start);
std::tr1::unordered_map<std::string, int, StringHash, StringEqual> my_map_city;
// City hash insert
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
my_map_city[data[i]] = i;
}
printf("city hash insert:%d\n", GetTime() - start);
// map insert
std::map<std::string, int> my_map_tree;
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
my_map_tree[data[i]] = i;
}
printf("tree map insert:%d\n", GetTime() - start);
// City hash search
start = GetTime();
int value = 0;
for (int i = 0; i < kDataSize; ++i) {
value = my_map_city[data[i]];
}
printf("city hash search:%d\n", GetTime() - start);
// map search
start = GetTime();
for (int i = 0; i < kDataSize; ++i) {
value = my_map_tree[data[i]];
}
printf("tree map search:%d\n", GetTime() - start);
}
参考文献
[1]http://blog.csdn.net/liuben/article/details/5050697
[2]http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html
[3]http://sites.google.com/site/murmurhash/
[4]http://blog.csdn.net/wisage/article/details/7104866
[5]http://code.google.com/p/cityhash/
[6]http://gcc.gnu.org/onlinedocs/libstdc++/ext/pb_ds/index.html
[7]http://en.wikipedia.org/wiki/Hash_table
[8]http://en.wikipedia.org/wiki/MurmurHash
[9]http://hi.baidu.com/fdwm_lx/blog/item/f670e73582c8411d90ef3950.html
[10]http://www.cnblogs.com/Frandy/archive/2011/07/26/Hash_map_Unordered_map.html
193

被折叠的 条评论
为什么被折叠?



