HashTable—闭散列与开链法

最新推荐文章于 2024-09-03 22:44:46 发布

zhangrrrr

最新推荐文章于 2024-09-03 22:44:46 发布

阅读量2.3k

点赞数

CC 4.0 BY-SA版权

分类专栏：数据结构文章标签： hash hashtable 搜索

本文链接：https://blog.youkuaiyun.com/zhangrrrr/article/details/56009806

数据结构专栏收录该内容

4 篇文章

订阅专栏

本文深入探讨哈希表的基本原理及其两种主要实现方式：闭散列式与开链法。详细介绍了如何处理哈希冲突，包括线性探测、二次探测及使用链表解决冲突的方法。此外，还提供了具体的C++代码实现。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

哈希表，又称散列表，是搜索方法之一，其特点为根据关键字（key）直接访问在内存中的位置

直接定址法
举一个例子，现在有如下一组字符

// Sample input for direct addressing. String literals are immutable, so the
// element type must be `const char*` (binding a literal to `char*` is
// ill-formed since C++11).
const char* arr[] = { "hashtable" };

接着定义一个大小为256的数组Hash，由于元素是字符型char，每个字符的取值一定可以在这个数组中找到一个对应的位置进行插入；我们将这个表称为哈希表，搜索时直接将字符自身作为下标，便能找到其所存位置；
而根据key的使用方式又分为两种方法：直接使用和间接使用。上述方法属于直接使用，我们称之为直接定址法；间接使用中最为常用的是除留余数法，当然，间接法还有很多，如平方取中法、随机数法等；
除留余数法

就如其名，这种方法是将key对表的大小取模，从而得到一个小于表大小的index，再插入到对应位置，即：Hash（key）=key%size。那么当我们有这样一组数据：

    // Sample keys whose remainders mod 10 are all distinct: 7, 5, 8, 6.
    int arr1[4] = { 17, 10005, 108, 1006 };

我们开一个大小为10的数组，就完全可以进行存储；之前的一篇博客中我们介绍了map与set，其底层为RBTree，而在STL源代码中，还存在unordered_map与unordered_set，区别就在于这两个容器的底层是用哈希表实现的，那么我们在实现哈希表时，便也使用K,V格式，便于unordered_map的使用；
但是在写之前，我们还有一个重要的问题需要考虑，假设我们的表大小为10，而存在一组数据如下：

    // Sample keys that collide mod 10: remainders are 9, 8, 8, 9, 9.
    int arr1[5] = { 89, 18, 58, 9, 49 };

这时，89、9和49模10都是9（18和58模10也都是8），这种不同的key经过同一个函数处理后映射到相同位置的情况叫做哈希冲突。解决哈希冲突有两种方法，首先使用第一种方法：
闭散列式
当我们遇到哈希冲突时，检测其下一个位置是否有数据，没有则放置：
即：放置位置为（Hash（key）+i）% 表大小（i=0,1,2……），探测到表尾后回到表头继续

这种方法称为线性探测，当然这种方法缺陷也很明显：数据容易聚集；因此又出现了二次探测，即（Hash（key）+i²）% 表大小（i=0,1,2……）

这种方法使数据更为分散，对于哈希表来说，数据越分散效率越高，但相对来说，这样代码更加复杂，并且当冲突过多，i值太大时，位置并不好找，各有优缺；
而哈希表的大小虽然无特殊规定，但在SGI STL中还是使用了素数作为表格大小

// Table sizes to choose from: 28 ascending primes, each roughly twice the
// previous one, ending at 4294967291.
const int _PrimeSize = 28;
static const unsigned long _PrimeList[_PrimeSize] =
{
    53ul,         97ul,         193ul,        389ul,
    769ul,        1543ul,       3079ul,       6151ul,
    12289ul,      24593ul,      49157ul,      98317ul,
    196613ul,     393241ul,     786433ul,     1572869ul,
    3145739ul,    6291469ul,    12582917ul,   25165843ul,
    50331653ul,   100663319ul,  201326611ul,  402653189ul,
    805306457ul,  1610612741ul, 3221225473ul, 4294967291ul
};

那么万事俱备我们便可以着手写一个闭散列式的哈希表了
哈希表节点

// Occupancy state of a single slot in the closed-hashing table.
enum Status
{
    EXIST  = 0, // slot holds a live key/value pair
    DELETE = 1, // slot held an entry that was removed; probing must continue past it
    EMPTY  = 2, // slot is unused; a probe sequence ends here
};
// One slot of the closed-hashing table: a key, its mapped value and the
// slot's occupancy state. A freshly constructed slot is always EMPTY, whatever
// key/value it is given.
template<typename K, typename V>
struct HashNode
{
    K      _key;
    V      _value;
    Status _status;

    HashNode(const K& key = K(), const V& value = V())
        : _key(key), _value(value), _status(EMPTY)
    {
    }
};

其中枚举变量status是为了使我们找节点时，避免由于节点删除使我们找不到后面的节点：

这里写图片描述

在删除9后，若不标注为删除状态，则49我们便找不到了；
HashTable类

template<class K, class V,class HashFunc=__HashFunc<K>>
class HashTable
{
    typedef HashNode<K, V>  Node;
public:

    HashTable(size_t size)
        :_size(0)
    {
        assert(size > 0);
        _tables.resize(size);
    }
    ~HashTable()
    {}

    pair<Node*,bool> Insert(const K& key,const V& value )
    {
        CheckCapacity();
        //线性探测
        size_t index = HashFunction(key);
        while (_tables[index]._status == EXIST)
        {
            if (_tables[index]._key == key)
            {
                return make_pair(&_tables[index], false);
            }
            ++index;
            if (index == _tables.size())
                index = 0;
        }
        _tables[index]._key = key;
        _tables[index]._value = value;
        _tables[index]._status = EXIST;
        ++_size;
        return make_pair(&_tables[index], true);
    }
    Node* Find(const K& key)
    {
        size_t index = HashFunction(key);
        while (_tables[index]._status != EMPTY)
        {
            if (_tables[index]._key == key)
            {
                if (_tables[index]._status != DELETE)
                    return &_tables[index];
            }
            else
                return NULL;
            ++index;
            if (index == _tables.size())
                index = 0;
        }
        return NULL;
    }
    bool Remove(const K& key,const V& value)
    {
        size_t index = HashFunction(key);
        while (_tables[index]._status == EXIST)
        {
            if (_tables[index]._key == key)
            {
                _tables[index]._status = DELETE;
                return true;
            }
            ++index;
            if (index == _tables.size())
                index = 0;
        }
        return false;
    }
protected:
    size_t HashFunction(const K& key)//求得模值
    {
        HashFunc k;
        size_t size = k(key);
        return size%_tables.size();
    }
    void Swap(HashTable<K, V, HashFunc>& tmp)//为扩容中交换节点提供交换（现代写法）
    {
        swap(_tables, tmp._tables);
        swap(_size, tmp._size);
    }
    void CheckCapacity()//当负载因子大于等于0.7时，取下一个素数作为新的表格大小
    {
        if (_size * 10 / _tables.size() >= 7)
        {
            size_t NewSize = GetNextPrime(_tables.size());

            HashTable<K, V, HashFunc> tmp(NewSize);
            for (size_t i = 0; i < _tables.size(); i++)
            {
                if (_tables[i]._status == EXIST)
                {
                    tmp.Insert(_tables[i]._key, _tables[i]._value);
                }
            }
            Swap(tmp);
            return;
        }
    }
    size_t GetNextPrime(size_t num)//素数表返回表格大小
    {
        static size_t index = 0;
        const int _PrimeSize = 28;
        static const unsigned long _PrimeList[_PrimeSize] =
        {
            53ul, 97ul, 193ul, 389ul, 769ul,
            1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
            49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
            1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
            50331653ul, 100663319ul, 201326611ul, 402653189ul,
            805306457ul, 1610612741ul, 3221225473ul, 4294967291ul
        };
        for (size_t i = 0; i<_PrimeSize; ++i)
        {
            if (_PrimeList[i]>num)
            {
                return _PrimeList[i];
            }
        }
        return _PrimeList[27];
    }

protected:
    vector<Node> _tables;
    size_t _size;
};

实现的接口共有Insert,Remove,Find;

其中需要解释有两点

1.模板参数最后一项__HashFunc，是为了使这个表可以存任意类型，当数据为string时，不能进行%表格大小，因此做一个仿函数，并在此处将string 模板特化，这样使用string也可以直接使用

// Default hash functor: converts the key itself to size_t, so it works for
// any key type convertible to an integer.
template<class K>
struct __HashFunc
{
    size_t operator()(const K& key) const
    {
        return static_cast<size_t>(key);
    }
};

// Specialization for std::string: a string cannot be taken modulo the table
// size directly, so it is first reduced to an integer with the BKDR hash.
template<>
struct __HashFunc<std::string>
{
    size_t operator()(const std::string& s) const
    {
        return BKDR_Hash(s.c_str());
    }

    // BKDR string hash: hash = hash * 131 + c for every character of the
    // NUL-terminated string `str`, with the top bit masked off at the end.
    static size_t BKDR_Hash(const char* str)
    {
        unsigned int seed = 131;
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * seed + (*str++);
        }
        return (hash & 0x7FFFFFFF);
    }
};

2.扩容中提到的负载因子，这个概念是由于哈希表本身存在一个悖论：所存数据越多，浪费的空间越少，但冲突越多、效率越低；数据存得越少，效率就越高，但浪费的空间就越多。因此将存储数据个数/表格大小的值称为负载因子，并将其保持在0.7~0.9时，对于效率和空间同时而言最优。

当然还有第二种方法，通过一种顺序表加链表的方法，即在哈希表中每个节点不再只存储数据，而是存储一个指向一个节点的指针，这样在遇到哈希冲突时可以通过节点一直向下存储或访问，这种方法称为：
开链法
这里写图片描述
当然在这种情况下，显然之前的负载因子就不再适用于判断，我们定义此时的负载因子保持在1最好，而如果单个节点下链的节点过多（哈希冲突过多），可以选择在这一单个节点下挂红黑树，从而提高访问效率
哈希桶（开链法）代码

#pragma once
#include<iostream>
#include<vector>
using namespace std;
//为了避免和闭散列式发生冲突，使用不同命名空间
namespace Bucket
{
    template<class K, class V>
    class HashTable;

    // Singly linked node of one bucket chain.
    template<class K, class V>
    struct HashNode
    {
        std::pair<K, V> _kv;    // stored key/value pair
        HashNode<K, V>* _next;  // next node in the same bucket, or null

        HashNode(const std::pair<K, V>& kv)
            :_kv(kv)
            , _next(nullptr)
        {}
    };

    // Forward iterator over every node of a HashTable.
    // Ref/Ptr select the reference and pointer types returned by * and ->.
    // The end position is represented by a null node pointer.
    template<class K, class V, class Ref, class Ptr>
    struct HashTableIterator
    {
        typedef HashNode<K, V> Node;
        typedef HashTableIterator<K, V, Ref, Ptr> Self;

        Node* _node;            // current node; null means end
        HashTable<K, V>* _ht;   // table being iterated, needed to hop between buckets

        HashTableIterator(Node* node, HashTable<K, V>* ht)
            :_node(node)
            , _ht(ht)
        {}

        Ref operator* ()
        {
            return _node->_kv;
        }

        Ptr operator-> ()
        {
            return &(operator*());
        }

        // Advances to the next node; incrementing the end iterator is a no-op.
        Self& operator++ ()
        {
            if (_node)
            {
                _node = Next(_node);
            }
            return *this;
        }

        bool operator!= (const Self& s) const
        {
            return _node != s._node;
        }

        bool operator== (const Self& s) const
        {
            return _node == s._node;
        }

        // Returns the node following `node`: the rest of its chain first, then
        // the head of the next non-empty bucket; null when nothing is left.
        Node* Next(Node* node)
        {
            if (node->_next)
            {
                return node->_next;
            }
            size_t index = _ht->HashFunc(node->_kv.first) + 1;
            for (; index < _ht->_tables.size(); ++index)
            {
                if (_ht->_tables[index])
                {
                    return _ht->_tables[index];
                }
            }
            return nullptr;
        }
    };

    // Separate-chaining hash table: a vector of bucket heads, each the start
    // of a singly linked list of nodes. The bucket index is key % bucket count,
    // so K must support the % operator with size_t.
    template<class K, class V>
    class HashTable
    {
        typedef HashNode<K, V>  Node;
    public:
        typedef HashTableIterator<K, V, std::pair<K, V>&, std::pair<K, V>*> Iterator;
        typedef HashTableIterator<K, V, const std::pair<K, V>&, const std::pair<K, V>*> ConstIterator;

        // Iterators call HashFunc and read _tables. A typedef name cannot be
        // used in `friend struct X;`, so the template itself is befriended.
        template<class K2, class V2, class Ref2, class Ptr2>
        friend struct HashTableIterator;

        HashTable()
            :_size(0)
        {}

        HashTable(size_t size)
            :_size(0)
        {
            _tables.resize(size);
        }

        // The table owns its nodes through raw pointers; a member-wise copy
        // would make two tables delete the same nodes, so copying is disabled.
        HashTable(const HashTable&) = delete;
        HashTable& operator=(const HashTable&) = delete;

        ~HashTable()
        {
            Clear();
        }

        // Inserts kv. Returns {node, true} for a new key, or {existing node,
        // false} when the key is already stored (its value is left unchanged).
        std::pair<Node*, bool> Insert(const std::pair<K, V>& kv)
        {
            CheckCapacity();
            if (Node* ret = Find(kv.first))
            {
                return std::make_pair(ret, false);
            }
            // Push the new node at the head of its bucket.
            const size_t index = HashFunc(kv.first);
            Node* tmp = new Node(kv);
            tmp->_next = _tables[index];
            _tables[index] = tmp;
            ++_size;
            return std::make_pair(tmp, true);
        }

        // Returns the node holding `key`, or null when it is absent.
        Node* Find(const K& key)
        {
            if (_tables.empty())
            {
                return nullptr;     // no buckets yet; also avoids key % 0
            }
            Node* cur = _tables[HashFunc(key)];
            while (cur)
            {
                if (cur->_kv.first == key)
                    return cur;
                cur = cur->_next;
            }
            return nullptr;
        }

        // Removes the node whose key equals kv.first (kv.second is not
        // compared). Returns true when a node was removed.
        bool Remove(const std::pair<K, V>& kv)
        {
            if (_tables.empty())
            {
                return false;       // no buckets yet; also avoids key % 0
            }
            const size_t index = HashFunc(kv.first);
            Node* prev = nullptr;
            for (Node* cur = _tables[index]; cur; prev = cur, cur = cur->_next)
            {
                if (cur->_kv.first == kv.first)
                {
                    if (prev == nullptr)
                        _tables[index] = cur->_next;    // unlink the bucket head
                    else
                        prev->_next = cur->_next;       // unlink from mid-chain
                    delete cur;
                    --_size;
                    return true;
                }
            }
            return false;
        }

        // Iterator to the first node of the first non-empty bucket, or End().
        Iterator Begin()
        {
            for (size_t index = 0; index < _tables.size(); index++)
            {
                if (_tables[index])
                {
                    return Iterator(_tables[index], this);
                }
            }
            return End();
        }

        Iterator End()
        {
            return Iterator(static_cast<Node*>(nullptr), this);
        }

    protected:
        // Returns the first listed prime greater than `num`, or the largest
        // listed prime when none is greater.
        size_t GetNextPrime(size_t num)
        {
            const int _PrimeSize = 28;
            static const unsigned long _PrimeList[_PrimeSize] =
            {
                53ul, 97ul, 193ul, 389ul, 769ul,
                1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
                49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
                1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
                50331653ul, 100663319ul, 201326611ul, 402653189ul,
                805306457ul, 1610612741ul, 3221225473ul, 4294967291ul
            };
            for (int i = 0; i < _PrimeSize; ++i)
            {
                if (_PrimeList[i] > num)
                {
                    return _PrimeList[i];
                }
            }
            return _PrimeList[_PrimeSize - 1];
        }

        // Bucket index of `key` in the current table. The table must not be empty.
        size_t HashFunc(const K& key)
        {
            return key % _tables.size();
        }

        // Grows the bucket array to the next prime once there is one node per
        // bucket (load factor 1). Existing nodes are relinked into the new
        // array, so nothing is allocated or freed and node pointers stay valid.
        void CheckCapacity()
        {
            if (_size < _tables.size())
            {
                return;
            }
            const size_t newSize = GetNextPrime(_tables.size());
            if (newSize <= _tables.size())
            {
                return;     // already at the largest listed size; chains just get longer
            }
            std::vector<Node*> grown(newSize, nullptr);
            for (size_t index = 0; index < _tables.size(); index++)
            {
                Node* cur = _tables[index];
                while (cur)
                {
                    Node* next = cur->_next;
                    const size_t target = cur->_kv.first % newSize;
                    cur->_next = grown[target];
                    grown[target] = cur;
                    cur = next;
                }
            }
            _tables.swap(grown);
        }

        // Exchanges buckets and element count with `tmp`. Taken by reference:
        // a by-value copy would share, and later double-delete, the nodes.
        void Swap(HashTable<K, V>& tmp)
        {
            _tables.swap(tmp._tables);
            std::swap(_size, tmp._size);
        }

        // Deletes every node and leaves all buckets empty.
        void Clear()
        {
            for (size_t index = 0; index < _tables.size(); index++)
            {
                Node* cur = _tables[index];
                while (cur)
                {
                    Node* del = cur;
                    cur = cur->_next;
                    delete del;
                }
                _tables[index] = nullptr;
            }
            _size = 0;
        }

    protected:
        std::vector<Node*> _tables; // bucket heads
        size_t _size;               // number of stored nodes
    };
} // namespace Bucket