unordered_map和unordered_set特性以及解决哈希冲突-CSDN博客

本文链接：https://blog.csdn.net/zzzzz_ccc/article/details/149613021

1. 哈希表的特性及核心概念

哈希表（Hash Table）是一种高效的键值对存储结构，通过哈希函数建立键与存储位置的映射关系，实现平均 O(1) 时间复杂度的插入、查找和删除操作。

哈希函数（Hash Function）
将任意大小的输入（键值）映射到固定范围的输出（哈希地址）的函数。理想的哈希函数应具有：
- 确定性：同一键值始终映射到同一地址
- 均匀性：键值分布均匀，减少冲突
- 高效性：计算快速，时间复杂度为 O(1)
哈希冲突（Hash Collision）
不同键值通过哈希函数得到相同哈希地址的现象。解决方法主要有：
- 链地址法（Separate Chaining）：每个哈希地址对应一个链表 / 红黑树，冲突元素依次存储在链表中
- 开放定址法：冲突发生时，通过线性探测、二次探测等方式寻找下一个空闲位置
- 再哈希法：使用多个哈希函数，冲突时切换函数重新计算地址
负载因子（Load Factor）
哈希表中元素数量与桶（Bucket）数量的比值，计算公式：负载因子 = 元素数 / 桶数。负载因子过大会导致冲突率上升，通常当负载因子超过阈值（如 0.7）时触发扩容。
扩容（Resizing）
当负载因子超标时，创建更大的桶数组（通常为原大小的 2 倍或素数倍），并将所有元素重新哈希到新桶中，以降低冲突率。
桶（Bucket）
哈希表的基本存储单元，每个桶对应一个哈希地址，可存放单个元素或冲突元素组成的链表 / 树。

1.1 开放定址法的代码实现

#pragma once
#include<vector>

// Status of one slot in the open-addressing table.
enum State
{
	EXIST = 0,   // slot currently holds an element
	EMPTY = 1,   // initial state of a default-constructed slot
	DELETE = 2   // slot held an element that was erased (tombstone)
};

// One slot of the open-addressing table: the stored key/value pair plus the
// slot's status.
template<class K, class V>
struct HashData
{
	// Qualified with std:: so the header does not depend on the includer
	// having written `using namespace std`.
	std::pair<K, V> _kv;    // payload; lookups only compare it while _state == EXIST
	State _state = EMPTY;   // a default-constructed slot starts out EMPTY
};

// Default hash functor: converts the key itself to size_t.
// Works for any key type that can be cast to size_t (e.g. integers); other key
// types need a specialization or a user-supplied functor.
template<class K>
struct HashFunc
{
	// const-qualified so the functor can also be invoked through a const object
	// or a const reference.
	size_t operator()(const K& key) const
	{
		// C-style cast kept deliberately: it accepts every key type the original
		// accepted (a static_cast would reject pointer keys).
		return (size_t)key;
	}
};

// Specialization for std::string keys (BKDR-style hash with factor 131).
// std::string is spelled out so the header does not rely on a using-directive.
template<>
struct HashFunc<std::string>
{
	// const-qualified so a const functor object can be called.
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		for (char ch : s)
		{
			// Fold in the character, then scale; size_t arithmetic wraps on overflow.
			hash += ch;
			hash *= 131;
		}

		return hash;
	}
};

// Returns the smallest entry of the built-in table that is >= n.
// When n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
	// Note: assumes long is at least 32 bits.
	static const int __stl_num_primes = 28;
	static const unsigned long __stl_prime_list[__stl_num_primes] = {
		53, 97, 193, 389, 769,
		1543, 3079, 6151, 12289, 24593,
		49157, 98317, 196613, 393241, 786433,
		1572869, 3145739, 6291469, 12582917, 25165843,
		50331653, 100663319, 201326611, 402653189, 805306457,
		1610612741, 3221225473, 4294967291
	};
	const unsigned long* first = __stl_prime_list;
	const unsigned long* last = __stl_prime_list + __stl_num_primes;
	// Must be qualified: the arguments are raw pointers, so argument-dependent
	// lookup cannot find std::lower_bound without a using-directive.
	const unsigned long* pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}

namespace open_address
{
	template<class K, class V, class Hash = HashFunc<K>>
	class HashTable
	{
	public:
		HashTable()
			:_tables(__stl_next_prime(0))
			, _n(0)
		{}

		bool Insert(const pair<K, V>& kv)
		{
			if (Find(kv.first))
				return false;

			// 负载因子 >= 0.7扩容
			if (_n * 10 / _tables.size() >= 7)
			{
				//vector<HashData<K, V>> newtables(_tables.size()*2);
				//for (auto& data : _tables)
				//{
				//	// 旧表的数据映射到新表
				//	if (data._state == EXIST)
				//	{
				//		size_t hash0 = data._kv.first % newtables.size();
				//		// ...
				//	}
				//}

				//_tables.swap(newtables);

				HashTable<K, V, Hash> newht;
				//newht._tables.resize(_tables.size() * 2);
				newht._tables.resize(__stl_next_prime(_tables.size() + 1));

				for (auto& data : _tables)
				{
					// 旧表的数据映射到新表
					if (data._state == EXIST)
					{
						newht.Insert(data._kv);
					}
				}

				_tables.swap(newht._tables);
			}

			Hash hash;
			size_t hash0 = hash(kv.first) % _tables.size();
			size_t hashi = hash0;
			size_t i = 1;
			int flag = 1;
			while (_tables[hashi]._state == EXIST)
			{
				// 线性探测
				hashi = (hash0 + i) % _tables.size();
				++i;

				/*hashi = (hash0 + (i*i*flag)) % _tables.size();
				if (hashi < _tables.size())
					hashi += _tables.size();

				if (flag == 1)
				{
					flag = -1;
				}
				else
				{
					++i;
					flag = 1;
				}*/
			}

			_tables[hashi]._kv = kv;
			_tables[hashi]._state = EXIST;
			++_n;

			return true;
		}

		HashData<K, V>* Find(const K& key)
		{
			Hash hash;
			size_t hash0 = hash(key) % _tables.size();
			size_t hashi = hash0;
			size_t i = 1;
			while (_tables[hashi]._state != EMPTY)
			{
				if (_tables[hashi]._state == EXIST
					&& _tables[hashi]._kv.first == key)
				{
					return &_tables[hashi];
				}

				// 线性探测
				hashi = (hash0 + i) % _tables.size();
				++i;
			}

			return nullptr;
		}

		bool Erase(const K& key)
		{
			HashData<K, V>* ret = Find(key);
			if (ret)
			{
				ret->_state = DELETE;
				return true;
			}
			else
			{
				return false;
			}
		}

	private:
		vector<HashData<K, V>> _tables;
		size_t _n;  // 记录数据个数
	};
}

1.2 链地址法的代码实现

#pragma once
#include<vector>
#include<xutility>
using namespace std;
// Slot status markers; HashData default-initializes its state to EMPTY.
enum State
{
	EXIST = 0,
	EMPTY = 1,
	DELETE = 2
};

// A key/value pair together with a State marker.
template<class K, class V>
struct HashData
{
	// Qualified with std:: so this definition does not depend on the
	// file-scope `using namespace std`.
	std::pair<K, V> _kv;
	State _state = EMPTY;   // default-constructed objects start out EMPTY
};

// Default hash functor: converts the key itself to size_t.
// Works for any key type that can be cast to size_t (e.g. integers); other key
// types need a specialization or a user-supplied functor.
template<class K>
struct HashFunc
{
	// const-qualified so the functor can also be invoked through a const object.
	size_t operator()(const K& key) const
	{
		// C-style cast kept deliberately so every previously accepted key type
		// (including pointers) is still accepted.
		return (size_t)key;
	}
};

// Specialization for std::string keys (BKDR-style hash with factor 131).
template<>
struct HashFunc<std::string>
{
	// const-qualified so a const functor object can be called.
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		for (char ch : s)
		{
			// Fold in the character, then scale; size_t arithmetic wraps on overflow.
			hash += ch;
			hash *= 131;
		}

		return hash;
	}
};

// Returns the smallest entry of the built-in table that is >= n.
// When n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
	// Note: assumes long is at least 32 bits.
	static const int __stl_num_primes = 28;
	static const unsigned long __stl_prime_list[__stl_num_primes] = {
		53, 97, 193, 389, 769,
		1543, 3079, 6151, 12289, 24593,
		49157, 98317, 196613, 393241, 786433,
		1572869, 3145739, 6291469, 12582917, 25165843,
		50331653, 100663319, 201326611, 402653189, 805306457,
		1610612741, 3221225473, 4294967291
	};
	const unsigned long* first = __stl_prime_list;
	const unsigned long* last = __stl_prime_list + __stl_num_primes;
	// Qualified so the call no longer depends on a file-scope using-directive
	// (ADL cannot find std::lower_bound for raw pointer arguments).
	const unsigned long* pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}

 namespace hash_bucket
{
	// Node of a singly linked bucket chain holding one key/value pair.
	template<class K,class V>
	struct HashNode
	{
		std::pair<K, V> _kv;
		HashNode<K, V>* _next;

		HashNode(const std::pair<K,V>& kv)
			:_kv(kv)
			,_next(nullptr)
		{}
	};

	// Hash table using separate chaining: every bucket is the head of a singly
	// linked list of nodes. Hash maps a key to size_t.
	template<class K,class V,class Hash=HashFunc<K>>
	class HashTable
	{
		typedef HashNode<K,V> Node;
	public:
		// Starts with 11 empty buckets.
		HashTable()
			: _tables(11)
			,_n(0)
		{}

		// The table owns its nodes through raw pointers, so a member-wise copy
		// would free every node twice; copying is therefore disabled.
		HashTable(const HashTable&) = delete;
		HashTable& operator=(const HashTable&) = delete;

		// Frees every node of every chain.
		~HashTable()
		{
			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_tables[i] = nullptr;
			}
		}

		// Inserts kv unless its key is already stored.
		// Returns true when the pair was inserted, false for a duplicate key.
		bool Insert(const std::pair<K, V>& kv)
		{
			Hash hash;

			// Reject duplicates by scanning the key's chain.
			for (Node* cur = _tables[hash(kv.first) % _tables.size()]; cur; cur = cur->_next)
			{
				if (cur->_kv.first == kv.first)
					return false;
			}

			// Grow once the load factor reaches 1 (as many elements as buckets).
			if (_n == _tables.size())
			{
				std::vector<Node*> newTable(__stl_next_prime(_tables.size() + 1));
				for (size_t i = 0; i < _tables.size(); i++)
				{
					Node* cur = _tables[i];
					while (cur)
					{
						// Relink the existing node into the new table (head
						// insertion); no node is allocated or freed here.
						Node* next = cur->_next;
						size_t hashi = hash(cur->_kv.first) % newTable.size();
						cur->_next = newTable[hashi];
						newTable[hashi] = cur;

						cur = next;
					}
					_tables[i] = nullptr;
				}
				_tables.swap(newTable);
			}

			size_t hashi = hash(kv.first) % _tables.size();
			// Head insertion into the bucket's chain.
			Node* newnode = new Node(kv);
			newnode->_next = _tables[hashi];
			_tables[hashi] = newnode;
			++_n;

			return true;
		}


	private:
		std::vector<Node*> _tables; // array of bucket head pointers
		size_t _n = 0;		   // number of stored elements
	};
}

2. 模拟实现 unordered_map 和 unordered_set 的要点分析

底层哈希表设计

unordered_map和unordered_set底层均依赖哈希表实现，区别在于存储的数据类型：

unordered_set存储单个键值（键即值）
unordered_map存储键值对（pair<Key, Value>）

核心设计要点：

桶数组：使用动态数组存储桶，每个桶为链表头指针（链地址法解决冲突）
节点结构：包含键（或键值对）和指向下一节点的指针
迭代器：需支持遍历桶内链表和跨桶移动，重载++、*、->等操作符

hash_bucket.h 头文件

#pragma once
#include<vector>
#include<xutility>

// Slot status markers; HashData default-initializes its state to EMPTY.
enum State
{
	EXIST = 0,
	EMPTY = 1,
	DELETE = 2
};

// A key/value pair together with a State marker.
template<class K, class V>
struct HashData
{
	// Qualified with std:: so the header does not depend on the includer
	// having written `using namespace std`.
	std::pair<K, V> _kv;
	State _state = EMPTY;   // default-constructed objects start out EMPTY
};

// Default hash functor: converts the key itself to size_t.
// Works for any key type that can be cast to size_t (e.g. integers); other key
// types need a specialization or a user-supplied functor.
template<class K>
struct HashFunc
{
	// const-qualified so the functor can also be invoked through a const object.
	size_t operator()(const K& key) const
	{
		// C-style cast kept deliberately so every previously accepted key type
		// (including pointers) is still accepted.
		return (size_t)key;
	}
};

// Specialization for std::string keys (BKDR-style hash with factor 131).
// std::string is spelled out so the header does not rely on a using-directive.
template<>
struct HashFunc<std::string>
{
	// const-qualified so a const functor object can be called.
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		for (char ch : s)
		{
			// Fold in the character, then scale; size_t arithmetic wraps on overflow.
			hash += ch;
			hash *= 131;
		}

		return hash;
	}
};

// Returns the smallest entry of the built-in table that is >= n.
// When n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
	// Note: assumes long is at least 32 bits.
	static const int __stl_num_primes = 28;
	static const unsigned long __stl_prime_list[__stl_num_primes] = {
		53, 97, 193, 389, 769,
		1543, 3079, 6151, 12289, 24593,
		49157, 98317, 196613, 393241, 786433,
		1572869, 3145739, 6291469, 12582917, 25165843,
		50331653, 100663319, 201326611, 402653189, 805306457,
		1610612741, 3221225473, 4294967291
	};
	const unsigned long* first = __stl_prime_list;
	const unsigned long* last = __stl_prime_list + __stl_num_primes;
	// Must be qualified: the arguments are raw pointers, so argument-dependent
	// lookup cannot find std::lower_bound without a using-directive.
	const unsigned long* pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}

namespace hash_bucket
{
	// Node of a singly linked bucket chain; T is the stored element type.
	template<class T>
	struct HashNode
	{
		T _data;
		HashNode<T>* _next;

		HashNode(const T& data)
			:_data(data)
			,_next(nullptr)
		{}
	};

	// Forward declaration: the iterator stores a pointer back to its table.
	template<class K, class T, class KeyOfT, class Hash>
	class HashTable;

	// Forward iterator over all elements of a HashTable.
	// Ref / Ptr select the mutable (T&, T*) or read-only (const T&, const T*)
	// flavour. The parameter order here must match the Iterator / ConstIterator
	// typedefs in HashTable.
	template<class K, class T, class Ref, class Ptr, class KeyOfT, class Hash>
	struct HTIterator
	{
		typedef HashNode<T> Node;
		typedef HashTable<K, T, KeyOfT, Hash> HT;
		typedef HTIterator<K, T, Ref, Ptr, KeyOfT, Hash> Self;

		Node* _node;      // current node; nullptr represents End()
		const HT* _ht;    // owning table, needed to step to the next bucket

		HTIterator(Node* node, const HT* ht)
			:_node(node)
			, _ht(ht)
		{}

		Ref operator*() const
		{
			return _node->_data;
		}

		Ptr operator->() const
		{
			return &_node->_data;
		}

		bool operator!=(const Self& s) const
		{
			return _node != s._node;
		}

		bool operator==(const Self& s) const
		{
			return _node == s._node;
		}

		// Pre-increment. Must not be called on End() (it dereferences _node).
		Self& operator++()
		{
			if (_node->_next)
			{
				// Still inside the current chain.
				_node = _node->_next;
				return *this;
			}

			// End of this chain: recompute the current bucket index from the
			// element's key, then scan forward for the next non-empty bucket.
			KeyOfT kot;
			Hash hash;
			size_t hashi = hash(kot(_node->_data)) % _ht->_tables.size();

			_node = nullptr;   // stays null (== End()) when no bucket follows
			for (++hashi; hashi < _ht->_tables.size(); ++hashi)
			{
				if (_ht->_tables[hashi])
				{
					_node = _ht->_tables[hashi];
					break;
				}
			}

			return *this;
		}
	};

	// Hash table using separate chaining.
	// K: key type, T: stored element type, KeyOfT: functor extracting the key
	// from a T, Hash: functor mapping a key to size_t.
	template<class K, class T, class KeyOfT, class Hash>
	class HashTable
	{
		// The iterator reads _tables directly. The parameters are left unnamed so
		// they do not shadow this class's own template parameters.
		template<class, class, class, class, class, class>
		friend struct HTIterator;

		typedef HashNode<T> Node;
	public:
		typedef HTIterator<K, T, T&, T*, KeyOfT, Hash> Iterator;
		typedef HTIterator<K, T, const T&, const T*, KeyOfT, Hash> ConstIterator;

		// Iterator to the first element in bucket order, or End() when empty.
		Iterator Begin()
		{
			if (_n == 0)
				return End();

			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				if (cur)
				{
					return Iterator(cur, this);
				}
			}

			return End();
		}

		Iterator End()
		{
			return Iterator(nullptr, this);
		}

		ConstIterator Begin() const
		{
			if (_n == 0)
				return End();

			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				if (cur)
				{
					return ConstIterator(cur, this);
				}
			}

			return End();
		}

		ConstIterator End() const
		{
			return ConstIterator(nullptr, this);
		}

		// Starts with the smallest bucket count offered by __stl_next_prime.
		HashTable()
			:_tables(__stl_next_prime(0))
			, _n(0)
		{}

		// Deep copy: same bucket count, every chain cloned in the same order.
		HashTable(const HashTable& other)
			:_tables(other._tables.size())
			, _n(0)
		{
			try
			{
				for (size_t i = 0; i < other._tables.size(); i++)
				{
					Node** tail = &_tables[i];
					for (const Node* cur = other._tables[i]; cur; cur = cur->_next)
					{
						*tail = new Node(cur->_data);
						tail = &(*tail)->_next;
						++_n;
					}
				}
			}
			catch (...)
			{
				// The destructor does not run for a half-built object, so the
				// nodes cloned so far must be released here.
				Clear();
				throw;
			}
		}

		// Copy-and-swap: the by-value parameter is the copy, and its destructor
		// releases the nodes this table held before the assignment.
		HashTable& operator=(HashTable other)
		{
			_tables.swap(other._tables);
			std::swap(_n, other._n);
			return *this;
		}

		~HashTable()
		{
			Clear();
		}

		// Inserts data unless an element with the same key already exists.
		// Returns {iterator to the element, true} on insertion and
		// {iterator to the existing element, false} for a duplicate key.
		std::pair<Iterator, bool> Insert(const T& data)
		{
			KeyOfT kot;
			Iterator it = Find(kot(data));
			if (it != End())
				return { it, false };

			Hash hash;

			// Grow once the load factor reaches 1.
			if (_n == _tables.size())
			{
				std::vector<Node*> newTable(__stl_next_prime(_tables.size() + 1));
				for (size_t i = 0; i < _tables.size(); i++)
				{
					Node* cur = _tables[i];
					while (cur)
					{
						Node* next = cur->_next;
						// Relink the existing node at the head of its new bucket.
						size_t hashi = hash(kot(cur->_data)) % newTable.size();
						cur->_next = newTable[hashi];
						newTable[hashi] = cur;

						cur = next;
					}

					_tables[i] = nullptr;

				}

				_tables.swap(newTable);
			}

			size_t hashi = hash(kot(data)) % _tables.size();
			// Head insertion into the bucket's chain.
			Node* newnode = new Node(data);
			newnode->_next = _tables[hashi];
			_tables[hashi] = newnode;
			++_n;

			return { Iterator(newnode, this), true };
		}

		// Returns an iterator to the element with the given key, or End().
		Iterator Find(const K& key)
		{
			KeyOfT kot;
			Hash hash;
			size_t hashi = hash(key) % _tables.size();
			Node* cur = _tables[hashi];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					return Iterator(cur, this);
				}

				cur = cur->_next;
			}

			return End();
		}

		// Removes the element with the given key; returns true when one was removed.
		bool Erase(const K& key)
		{
			KeyOfT kot;
			Hash hash;
			// Use the same hash functor as Insert / Find so the lookup lands in the
			// bucket the element was stored in.
			size_t hashi = hash(key) % _tables.size();
			Node* prev = nullptr;
			Node* cur = _tables[hashi];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					if (prev == nullptr)
					{
						// Removing the head of the chain.
						_tables[hashi] = cur->_next;
					}
					else
					{
						// Removing a node after the head.
						prev->_next = cur->_next;
					}

					delete cur;
					--_n;

					return true;
				}
				else
				{
					prev = cur;
					cur = cur->_next;
				}
			}

			return false;
		}
	private:
		// Frees every node and leaves all buckets empty.
		void Clear()
		{
			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;

					cur = next;
				}

				_tables[i] = nullptr;
			}

			_n = 0;
		}

		std::vector<Node*> _tables; // array of bucket head pointers
		size_t _n = 0;		   // number of stored elements
	};
}

unordered_set的模拟实现

#pragma once
#include"hash_bucket.h"
namespace aaa
{
	template<class K, class Hash = HashFunc<K>>
	class unordered_set
	{
		struct SetKeyOfT
		{
			const K& operator()(const K& key)
			{
				return key;
			}
		};
	public:
		typedef typename hash_bucket::HashTable<K, const K, SetKeyOfT, Hash>::Iterator iterator;
		typedef typename hash_bucket::HashTable<K, const K, SetKeyOfT, Hash>::ConstIterator const_iterator;

		iterator begin()
		{
			return _ht.Begin();
		}

		iterator end()
		{
			return _ht.End();
		}

		const_iterator begin() const
		{
			return _ht.Begin();
		}

		const_iterator end() const
		{
			return _ht.End();
		}

		pair<iterator, bool> insert(const K& key)
		{
			return _ht.Insert(key);
		}

		iterator Find(const K& key)
		{
			return _ht.Find(key);
		}

		bool Erase(const K& key)
		{
			return _ht.Erase(key);
		}


	private:
		hash_bucket::HashTable<K, const K, SetKeyOfT, Hash> _ht;
	};
}

unordered_map的模拟实现

#pragma once
#include"hash_bucket.h"
namespace aaa
{
	template<class K, class V, class Hash = HashFunc<K>>
	class unordered_map
	{
		struct MapKeyOfT
		{
			const K& operator()(const pair<K, V>& kv)
			{
				return kv.first;
			}
		};
	public:

		typedef typename hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::Iterator iterator;
		typedef typename hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::ConstIterator const_iterator;

		iterator begin()
		{
			return _ht.Begin();
		}

		iterator end()
		{
			return _ht.End();
		}

		const_iterator begin() const
		{
			return _ht.Begin();
		}

		const_iterator end() const
		{
			return _ht.End();
		}

		V& operator[](const K& key)
		{
			pair<iterator, bool> ret = insert({ key, V() });
			return ret.first->second;
		}

		pair<iterator, bool> insert(const pair<K, V>& kv)
		{
			return _ht.Insert(kv);
		}

		iterator Find(const K& key)
		{
			return _ht.Find(key);
		}

		bool Erase(const K& key)
		{
			return _ht.Erase(key);
		}

	private:
		hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash> _ht;
	};
}

总的来说，unordered_map和unordered_set的模拟实现核心在于：