基于字符串的分离链接hash算法

最新推荐文章于 2025-09-06 09:26:50 发布

转载最新推荐文章于 2025-09-06 09:26:50 发布 · 89 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://my.oschina.net/floristgao/blog/1919826

文章标签：

#python #数据结构与算法

本文介绍了一种使用哈希算法处理字符串数组的方法，通过实例演示如何实现字符串的索引及统计出现次数，最后讨论了如何利用该算法解决海量数据查询问题。

2019独角兽企业重金招聘Python工程师标准>>>

Hashes
问题：你有一个很大的字符串数组。需要知道另外一个字符串是否在这个字符串数组中。你可能会将这个字符串与数组中的字符串依次作比较。但是实际中，你会发现这种方法太慢。必须找其它的方法。但是除了依次比较字符串外，还有没有其它方法来知道某个字符串是否存在呢？
解决方案： Hashes。 Hashes是用小的数据类型（如，数字）来表示其它大的数据类型（通常是字符串）。在这种情形下，你可能将字符串存储在hash数组中。然后你可以计算要查找字符串的hash值，用这个hash值与数组中的hash值进行比较。如果在hash数组中有一个hash值与这个新的要查询的hash值相等，则证实这个字符串存在。这个方法，称为索引(indexing)。
本文采用分离链接hash算法来实现基于字符串的hash算法，并且可以统计某个字符串出现的次数

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One node of a bucket chain in the hash table. */
struct hash_node {
    char *value;   /* string data, dynamically allocated; NULL while the node is an empty placeholder */
    int count;      /* number of times this string has been inserted */
    struct hash_node * next; /* next node in the separate-chaining collision list */
};
/* Hash table representation.
 * Two possible approaches:
 *   1. a fixed-size array
 *   2. a pointer-based (dynamically allocated) structure */
/* Array approach: bucket count; main() passes this to init_hash_table() */
#define MAX_HASH_TABLE 10000

/* Pointer-based representation of the hash table. */
struct hash_table {
    int num;   /* size of the hash table (number of buckets) */
    struct hash_node **hashlist; /* dynamically allocated array of bucket head pointers */
};

/* Pointer aliases used throughout the file; note that both hide a pointer. */
typedef struct hash_node * hash_list;   /* pointer to a chain node */
typedef struct hash_table* Hash_Table;  /* pointer to a hash table */

/*根据hash表大小，初始化hash表*/
Hash_Table init_hash_table(int hash_size)
{
    Hash_Table hashtable;
    int i;

    hashtable = (Hash_Table)malloc(sizeof(struct hash_table));

    if(hashtable == NULL) {
        printf("malloc hashtable error\n");
        return NULL;
    }
    hashtable->num = hash_size;/*hash数组大小*/

    /*为hash数组动态分配内存*/
    hashtable->hashlist = (struct hash_node **)malloc(sizeof(struct hash_node*) * hash_size);
    if(hashtable->hashlist == NULL) {
        printf("malloc hashlist error\n");
        free(hashtable);
        hashtable = NULL;
        return NULL;
    }
    /*根据hash数组的大小，为每一个成员分配内存,并且初始化内存*/
    for(i = 0; i < hash_size; i++) {
        hashtable->hashlist[i] = (struct hash_node*)malloc(sizeof(struct hash_node));
        if(hashtable->hashlist[i] == NULL) {
            printf("malloc hashtable->hashlist error\n");
            exit(1);
        }else {
            hashtable->hashlist[i]->value = NULL;
            hashtable->hashlist[i]->count= 0;
            hashtable->hashlist[i]->next = NULL;
        }
    }
    return hashtable;
}
/*
 * Map a NUL-terminated string to a bucket index.
 *
 * The running hash is updated per character as h = h + (h << 5) + c
 * (i.e. h * 33 + c, with unsigned wrap-around), and the final value is
 * reduced modulo hash_size.
 *
 * key:       string to hash; must not be NULL.
 * hash_size: number of buckets; must be non-zero, since it is the modulus.
 *
 * Returns the hash value modulo hash_size.
 */
unsigned long get_hash_index(const char *key,int hash_size)
{
    unsigned long acc = 0;
    const char *cursor;

    for (cursor = key; *cursor != '\0'; cursor++) {
        acc = acc + (acc << 5) + *cursor;
    }

    return acc % hash_size;
}

/*
 * Insert a string into the hash table, or bump its counter if it is
 * already present.
 *
 * The bucket chain selected by get_hash_index() is walked:
 *   - a node holding an equal string gets its count incremented;
 *   - an empty placeholder node (value == NULL) is filled with a copy of
 *     the string;
 *   - otherwise a new node is appended to the end of the chain, so that
 *     different strings hashing to the same bucket are all kept.
 *
 * string:     NUL-terminated string to insert; it is copied, so the caller
 *             keeps ownership of its own buffer.
 * hash_table: table to insert into.
 *
 * Returns 0 on success, -1 on invalid arguments or allocation failure
 * (the table is left unchanged in that case).
 */
int hash_insert(char *string, Hash_Table hash_table)
{
    unsigned long index;
    hash_list node;
    hash_list tail = NULL;
    size_t len;
    char *copy;

    if (string == NULL || hash_table == NULL ||
        hash_table->hashlist == NULL || hash_table->num <= 0) {
        return -1;
    }

    index = get_hash_index(string, hash_table->num);

    /* Look for an existing entry or a reusable empty placeholder,
     * remembering the last occupied node so we can append after it. */
    for (node = hash_table->hashlist[index]; node != NULL; node = node->next) {
        if (node->value == NULL) {
            break;
        }
        if (strcmp(node->value, string) == 0) {
            node->count++;
            return 0;
        }
        tail = node;
    }

    /* Copy the string before touching the table so a failed allocation
     * cannot leave a half-initialised node behind. */
    len = strlen(string) + 1;
    copy = malloc(len);
    if (copy == NULL) {
        printf("error: malloc hash value failed\n");
        return -1;
    }
    memcpy(copy, string, len);

    if (node == NULL) {
        /* Reached the end of the chain without a match: append a new node. */
        node = malloc(sizeof *node);
        if (node == NULL) {
            printf("error: malloc hashlist failed\n");
            free(copy);
            return -1;
        }
        node->next = NULL;
        if (tail == NULL) {
            hash_table->hashlist[index] = node;
        } else {
            tail->next = node;
        }
    }

    node->value = copy;
    node->count = 1;
    return 0;
}



/*
 * Look up a string in the hash table.
 *
 * Walks the chain of the bucket chosen by get_hash_index() and compares
 * each stored string with strcmp(); nodes whose value is NULL are skipped.
 * A message is printed to stdout when the string is found.
 *
 * string:     NUL-terminated string to search for.
 * hash_table: table to search; must be a valid, initialised table.
 *
 * Returns the matching node (its count field holds the number of
 * insertions), or NULL when the string is not in the table.
 */
hash_list hash_find(const char *string, Hash_Table hash_table)
{
    unsigned long bucket = get_hash_index(string, hash_table->num);
    hash_list cur;

    for (cur = hash_table->hashlist[bucket]; cur != NULL; cur = cur->next) {
        if (cur->value == NULL) {
            continue;
        }
        if (strcmp(cur->value, string) != 0) {
            continue;
        }
        printf("find %s in hash table.....\n",string);
        return cur;
    }

    return NULL;
}


/*
 * Demo driver: builds a table, inserts "cdef" twice and "abcd" once, then
 * reports the hit count of "cdef" and shows that "wgw" is absent.
 *
 * Returns 0 on success, 1 if the table cannot be created or an insert fails.
 * The table is not freed before exit; no destroy routine is visible in this
 * file, so the memory is left for the OS to reclaim.
 */
int main(int argc, char *argv[])
{
    Hash_Table hash_table;
    hash_list hash;

    (void)argc;
    (void)argv;

    hash_table = init_hash_table(MAX_HASH_TABLE);
    if (hash_table == NULL) {
        /* init_hash_table() reports failure by returning NULL. */
        printf("init hash table failed\n");
        return 1;
    }

    /* "wgw" is deliberately never inserted, so the lookup below
     * demonstrates the not-found path. */
    if (hash_insert("cdef",hash_table) != 0 ||
        hash_insert("abcd",hash_table) != 0 ||
        hash_insert("cdef",hash_table) != 0) {
        printf("hash insert failed\n");
        return 1;
    }

    hash = hash_find("cdef",hash_table);
    if(hash) {
        printf("hit num of cdef is %d\n",hash->count);
    }

    hash = hash_find("wgw",hash_table);
    printf("%s\n",hash?"find wgw":"can't find wgw");
    if(hash) {
        printf("num=%d\n",hash->count);
    }

    return 0;
}

运行结果：
这里写图片描述

海量数据面试题：
搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。
假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门。），请你统计最热门的10个查询串，要求使用的内存不能超过1G。

分析：
不超过3百万个，假设都是最大的255个字节，加上next指针和计数count总共255+4+4=263bytes
3000000*263=789000000 ≈ 789Mbytes，小于1G内存。而且这个是考虑到极限情况，一般不会所有都是255字节。
可以考虑用上面的hash算法，来统计次数，然后用排序算法获取最大的10个查询串。

转载于:https://my.oschina.net/floristgao/blog/1919826