看了书上所讲,似乎hash表原理不复杂,但要设计好的hash表就很不容易了。目前我还没有在实践中用到过hash表,感觉不到hash表的强大,但是看其原理,利用hash表访问速度的确非常快,插入也算很快的了。
很多讲散列表的资料都会拿字典来做例子,因为查词需要速度,试想几十上百万的词条如果不组织的有规律,用遍历比较去查那样速度肯定快不起来,尽管也是线性 时间复杂度,尽管现在计算机速度很快。用散列组织这些词条的话查起来就是常数时间了。最近很迷茫,也不像做其它事情,写了个简单的散列模拟存储字典,随即 生成上限一定长度的单词,利用一个简单的算法生成单词对应的key,分别利用separate chaining和linear probing来解决冲突(分别用两个数组存储字典),然后遍历其中一个数组,并在另外一个数组中查找相应记录,记录所需时间。
/*
* @describe
* implement basic functions of hash table.
* create a dictionary which consists 500000 random words with length less than 10 by random
* use Separate Chaining and Open Addressing(linear probing, quadratic probing) and compare their efficiency
* implement by C
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define WORDS_COUNT 200000
#define NUM 400000
#define WORD_LENGTH 50
#define LETTERS_NUM 26
/*
* basic unit of the item involved in this programme
* id represent the number in hash table of the word and word[10] store the word
*/
typedef struct Item
{
int id;
char word[WORD_LENGTH];
} item,*pItem;
/*
* basic linked-list unit used in the Separate Chaining method
*/
typedef struct Word
{
struct Word *next;
char word[WORD_LENGTH];
} word, *pWord;
item createWord(void); // crate a word by random, random seed must be created first
void freeMemory(void); // free memory
void insert(void); // insert words to hash_tables
void init(void); // do initialization
void analyzeHash(void); // analyze hash table
int calculateId(int, const char *); // calculate key of hash by a word
/*
* global variables
*/
pItem hash_lp; // array used by linear probing
// pItem hash_qp; // array used by quadratic probing
pWord hash_sc; // array used by separate chaining
//----------------------------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
int time_start, time_finish;
srand(unsigned(time(NULL)));
time_start = clock();
init(); // initialize
time_finish = clock();
printf("init finished...it takes time %dms/n", time_finish-time_start);
time_start = time_finish;
insert(); // insert random words to hash tables
time_finish = clock();
printf("insert finished...it takes time %dms/n", time_finish-time_start);
time_start = time_finish;
analyzeHash(); // analyze hash tables
time_finish = clock();
printf("analyze finished...it takes time %dms/n", time_finish-time_start);
freeMemory(); // free memory
return 0;
}
/*
* calculate key of hash by a word
* assume a-z is represented by 1-26, and calculate the number of the word in hash table
* take word "aaa" for example: id = (1*26*26 + 1*26 + 1*1)%NUM
*/
int calculateId(int length, const char *word)
{
int i = 0, j = 0, result_id = 0;
for(i = 0; i < length; i++)
{
int num = word[i] - 'a' + 1; // create a number between 1-26
int tmp_id = num; // store num*26*[length-i-1 times]%COUNT
for(j = 1; j <= length - i - 1; j++) // num*26*[length-i-1 times]
{
int tmp = tmp_id * LETTERS_NUM;
tmp_id = (tmp < NUM ? tmp : (tmp%NUM));
}
result_id += tmp_id; // calculate id
if(result_id >= NUM)
{
result_id %= NUM;
}
}
return result_id;
}
/*
* initialize hash tables
*/
void init(void)
{
int i = 0;
item null_item;
word null_word;
null_item.id = -1;
null_item.word[0] = '/0';
null_word.next = NULL;
null_word.word[0] = '/0';
if(!(hash_lp=(pItem)malloc(NUM*sizeof(item))))
{
printf("not enough memory");
exit(0);
}
if(!(hash_sc=(pWord)malloc(NUM*sizeof(word))))
{
printf("not enough memory");
exit(0);
}
// initialize hash tables with NULL
for(i = 0; i < NUM; i++)
{
hash_lp[i] = null_item;
hash_sc[i] = null_word;
}
}
/*
* function: crate a word by random with lower-case letters only and with maxmium length of 10
* assume a-z is represented by 1-26, and calculate the number of the word in hash table
* take word "aaa" for example: id = (1*26*26 + 1*26 + 1*1)%NUM
*/
item createWord(void)
{
item new_item;
int length = 0;
int i = 0;
int j = 0;
new_item.id = 0;
new_item.word[0] = '/0';
length = rand()%WORD_LENGTH + 1;
/*
* create word by random
*/
for(i = 0; i < length; i++)
{
int num = rand()%LETTERS_NUM + 1; // create a number between 1-26
new_item.word[i] = (num - 1 + 'a'); // create a character by random
}
new_item.word[i] = '/0';
new_item.id = calculateId(length, new_item.word);
return new_item;
}
/*
* function: insert words to hash tables
*/
void insert(void)
{
/*
create words and insert them to three kinds of hash table respectively
*/
int i = 0;
for(i = 0; i < WORDS_COUNT; i++)
{
item new_item = createWord();
int id = new_item.id;
// insert into hash_lp
if(id >= NUM/2)
{
while(hash_lp[id].id != -1) { id--; } // search empty place
}
else
{
while(hash_lp[id].id != -1) { id++; } // search empty place
}
hash_lp[id].id = new_item.id;
strcpy(hash_lp[id].word, new_item.word);
// insert into hash_sc
id = new_item.id;
if(hash_sc[id].word[0] == '/0')
{
// if the place is empty
hash_sc[id].next = NULL;
strcpy(hash_sc[id].word, new_item.word);
}
else
{
// the place is not empty
// create a new word by item
pWord pNew_word = (pWord)malloc(sizeof(word));
if(!pNew_word)
{
printf("not enough memory");
exit(0);
}
strcpy(pNew_word->word, new_item.word);
pNew_word->next = hash_sc[id].next;
hash_sc[id].next = pNew_word;
}
}
}
/*
* function: analyze the basic functions of hash table
* A. iterate hash_lp, and find each word in hash_sc, calculate the total search time and vice versa
*/
void analyzeHash(void)
{
int i = 0, id = 0;
int find_count = 0;
/*
* iterate hash_lp, access in hash_sc
*/
for(i = 0; i < NUM; i++)
{
item tmp_item = hash_lp[i];
if(tmp_item.id != -1)
{
id = calculateId(strlen(tmp_item.word), tmp_item.word);
pWord tmp_word = &hash_sc[id];
if(tmp_word->word[0] == '/0')
{
printf("can't find error");
}
else
{
while((tmp_word != NULL) && (strcmp(tmp_item.word, tmp_word->word)))
{
tmp_word = tmp_word->next;
}
if(tmp_word == NULL)
{
// can't find
printf("can't find error");
}
else
{
find_count++;
// printf("find %s, id=%d/n", tmp_word->word, i);
}
}
}
}
printf("iterate hash_lp, access in hash_sc, find_count =%d/n", find_count);
}
/*
* function: free memory after the programe is over
*/
void freeMemory(void)
{
free(hash_lp);
for(int i = 0; i < NUM; i++)
{
if(hash_sc[i].next != NULL)
{
/*
* if the place has linked objects, the memory of them should be freed one by one
* perhaps use C++ class and with desconstructor function it will be much more convenient
*/
pWord pCurrent_word = (&hash_sc[i])->next;
while(pCurrent_word != NULL)
{
pWord tmp = pCurrent_word;
pCurrent_word = pCurrent_word->next;
free(tmp);
}
}
}
free(hash_sc);
}
很多讲散列表的资料都会拿字典来做例子,因为查词需要速度,试想几十上百万的词条如果不组织的有规律,用遍历比较去查那样速度肯定快不起来,尽管也是线性 时间复杂度,尽管现在计算机速度很快。用散列组织这些词条的话查起来就是常数时间了。最近很迷茫,也不像做其它事情,写了个简单的散列模拟存储字典,随即 生成上限一定长度的单词,利用一个简单的算法生成单词对应的key,分别利用separate chaining和linear probing来解决冲突(分别用两个数组存储字典),然后遍历其中一个数组,并在另外一个数组中查找相应记录,记录所需时间。
/*
* @describe
* implement basic functions of hash table.
* create a dictionary which consists 500000 random words with length less than 10 by random
* use Separate Chaining and Open Addressing(linear probing, quadratic probing) and compare their efficiency
* implement by C
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define WORDS_COUNT 200000
#define NUM 400000
#define WORD_LENGTH 50
#define LETTERS_NUM 26
/*
* basic unit of the item involved in this programme
* id represent the number in hash table of the word and word[10] store the word
*/
typedef struct Item
{
int id;
char word[WORD_LENGTH];
} item,*pItem;
/*
* basic linked-list unit used in the Separate Chaining method
*/
typedef struct Word
{
struct Word *next;
char word[WORD_LENGTH];
} word, *pWord;
item createWord(void); // crate a word by random, random seed must be created first
void freeMemory(void); // free memory
void insert(void); // insert words to hash_tables
void init(void); // do initialization
void analyzeHash(void); // analyze hash table
int calculateId(int, const char *); // calculate key of hash by a word
/*
* global variables
*/
pItem hash_lp; // array used by linear probing
// pItem hash_qp; // array used by quadratic probing
pWord hash_sc; // array used by separate chaining
//----------------------------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
int time_start, time_finish;
srand(unsigned(time(NULL)));
time_start = clock();
init(); // initialize
time_finish = clock();
printf("init finished...it takes time %dms/n", time_finish-time_start);
time_start = time_finish;
insert(); // insert random words to hash tables
time_finish = clock();
printf("insert finished...it takes time %dms/n", time_finish-time_start);
time_start = time_finish;
analyzeHash(); // analyze hash tables
time_finish = clock();
printf("analyze finished...it takes time %dms/n", time_finish-time_start);
freeMemory(); // free memory
return 0;
}
/*
* calculate key of hash by a word
* assume a-z is represented by 1-26, and calculate the number of the word in hash table
* take word "aaa" for example: id = (1*26*26 + 1*26 + 1*1)%NUM
*/
int calculateId(int length, const char *word)
{
int i = 0, j = 0, result_id = 0;
for(i = 0; i < length; i++)
{
int num = word[i] - 'a' + 1; // create a number between 1-26
int tmp_id = num; // store num*26*[length-i-1 times]%COUNT
for(j = 1; j <= length - i - 1; j++) // num*26*[length-i-1 times]
{
int tmp = tmp_id * LETTERS_NUM;
tmp_id = (tmp < NUM ? tmp : (tmp%NUM));
}
result_id += tmp_id; // calculate id
if(result_id >= NUM)
{
result_id %= NUM;
}
}
return result_id;
}
/*
* initialize hash tables
*/
void init(void)
{
int i = 0;
item null_item;
word null_word;
null_item.id = -1;
null_item.word[0] = '/0';
null_word.next = NULL;
null_word.word[0] = '/0';
if(!(hash_lp=(pItem)malloc(NUM*sizeof(item))))
{
printf("not enough memory");
exit(0);
}
if(!(hash_sc=(pWord)malloc(NUM*sizeof(word))))
{
printf("not enough memory");
exit(0);
}
// initialize hash tables with NULL
for(i = 0; i < NUM; i++)
{
hash_lp[i] = null_item;
hash_sc[i] = null_word;
}
}
/*
* function: crate a word by random with lower-case letters only and with maxmium length of 10
* assume a-z is represented by 1-26, and calculate the number of the word in hash table
* take word "aaa" for example: id = (1*26*26 + 1*26 + 1*1)%NUM
*/
item createWord(void)
{
item new_item;
int length = 0;
int i = 0;
int j = 0;
new_item.id = 0;
new_item.word[0] = '/0';
length = rand()%WORD_LENGTH + 1;
/*
* create word by random
*/
for(i = 0; i < length; i++)
{
int num = rand()%LETTERS_NUM + 1; // create a number between 1-26
new_item.word[i] = (num - 1 + 'a'); // create a character by random
}
new_item.word[i] = '/0';
new_item.id = calculateId(length, new_item.word);
return new_item;
}
/*
* function: insert words to hash tables
*/
void insert(void)
{
/*
create words and insert them to three kinds of hash table respectively
*/
int i = 0;
for(i = 0; i < WORDS_COUNT; i++)
{
item new_item = createWord();
int id = new_item.id;
// insert into hash_lp
if(id >= NUM/2)
{
while(hash_lp[id].id != -1) { id--; } // search empty place
}
else
{
while(hash_lp[id].id != -1) { id++; } // search empty place
}
hash_lp[id].id = new_item.id;
strcpy(hash_lp[id].word, new_item.word);
// insert into hash_sc
id = new_item.id;
if(hash_sc[id].word[0] == '/0')
{
// if the place is empty
hash_sc[id].next = NULL;
strcpy(hash_sc[id].word, new_item.word);
}
else
{
// the place is not empty
// create a new word by item
pWord pNew_word = (pWord)malloc(sizeof(word));
if(!pNew_word)
{
printf("not enough memory");
exit(0);
}
strcpy(pNew_word->word, new_item.word);
pNew_word->next = hash_sc[id].next;
hash_sc[id].next = pNew_word;
}
}
}
/*
* function: analyze the basic functions of hash table
* A. iterate hash_lp, and find each word in hash_sc, calculate the total search time and vice versa
*/
void analyzeHash(void)
{
int i = 0, id = 0;
int find_count = 0;
/*
* iterate hash_lp, access in hash_sc
*/
for(i = 0; i < NUM; i++)
{
item tmp_item = hash_lp[i];
if(tmp_item.id != -1)
{
id = calculateId(strlen(tmp_item.word), tmp_item.word);
pWord tmp_word = &hash_sc[id];
if(tmp_word->word[0] == '/0')
{
printf("can't find error");
}
else
{
while((tmp_word != NULL) && (strcmp(tmp_item.word, tmp_word->word)))
{
tmp_word = tmp_word->next;
}
if(tmp_word == NULL)
{
// can't find
printf("can't find error");
}
else
{
find_count++;
// printf("find %s, id=%d/n", tmp_word->word, i);
}
}
}
}
printf("iterate hash_lp, access in hash_sc, find_count =%d/n", find_count);
}
/*
* function: free memory after the programe is over
*/
void freeMemory(void)
{
free(hash_lp);
for(int i = 0; i < NUM; i++)
{
if(hash_sc[i].next != NULL)
{
/*
* if the place has linked objects, the memory of them should be freed one by one
* perhaps use C++ class and with desconstructor function it will be much more convenient
*/
pWord pCurrent_word = (&hash_sc[i])->next;
while(pCurrent_word != NULL)
{
pWord tmp = pCurrent_word;
pCurrent_word = pCurrent_word->next;
free(tmp);
}
}
}
free(hash_sc);
}