stdext::hash_set剖析，env:vs08 sp1-优快云博客

本文链接：https://blog.youkuaiyun.com/j8daxue/article/details/37530775

本文详细分析了VS2008 SP1中非标准容器stdext::hash_set的实现，包括其基于hash表的结构、插入操作以及查找效率。在插入过程中，通过计算hash值确定元素位置，并利用list作为开链存储。同时，文章探讨了负载因子超过4.0时的re-hash策略，并对比了find操作与lower_bound的区别。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

hash表对我来说是闻名不见面，因为stl提供的基于rbtree的容器目前是满足我的工作需求的。最近闲得慌，于是看看非标准(相对于C++98标准)容器hash_set的实现。另外我所知道使用hash表的还有MFC里的CMap，MFC的实现比较朴素，代码一看就懂，就不分析了。

源码基于VS2008 SP1。
为了表达明确，假设比较器为less<T>。

结构：

类似于基于rbtree的set、map。基于hash表的hash_set、hash_map都是提供接口的类，底层都是继承于_Hash这个模板类。最终的接口也和rbtree的实现一样，甚至insert都有重载一个hint的迭代器，尽管_Hash实际上都没用到这个迭代器。

就hash表的成员来讲，一般都是数组+链表指针(如MFC里的CMap)。然而hash_set用list<T>作为开链，并管理所有实际元素，用vector<list<T>::iterator>来代替传统数组做bucket。这和stl很多时候的做法一致：用现有容器，而不是裸概念。例如用deque实现queue，用vector+xx_heap系列函数实现priority_queue。

构造：

	// (Re)build the bucket table for _Buckets buckets; the element list is left as is.
	void _Init(size_type _Buckets = min_buckets)
		{
		// one slot more than the bucket count, every slot starting out as end()
		const size_type _Slots = _Buckets + 1;
		_Vec.assign(_Slots, end());

		_Maxidx = _Buckets;
		// NOTE(review): _Mask is presumably AND-ed with hash values in place of a
		// modulo, which needs _Buckets to be a power of two -- confirm in _Hashval
		_Mask = _Buckets - 1;
		}

代码比较少，主要是初始化vector的数量，和最大桶索引，其中_Mask用来取代%操作。a % (2^n) == (a & (2^n - 1))

insert

// Insert a copy of _Val; forwards to _Insert with end() as the node position,
// which makes _Insert create a new list element instead of moving an existing one.
_Pairib insert(const value_type& _Val)
		{
		const iterator _No_node = end();	// no existing list node to reuse
		return _Insert(_Val, _No_node);
		}

// Place _Val into the table and return a _Pairib holding the resulting
// iterator and a flag: true when the element was placed, false when an
// equivalent element was already present (only possible when !_Multi).
//   _Val   - value whose key (via _Kfn) selects the bucket and the position
//   _Where - end() to create a new list element from _Val, otherwise an
//            iterator to an existing list element that is moved into place
_Pairib _Insert(const value_type& _Val, iterator _Where)
		{	// try to insert (possibly existing) node with value _Val
		// bucket index derived from the key of _Val
		size_type _Bucket = _Hashval(this->_Kfn(_Val));
		// start at the position stored for the NEXT bucket, i.e. just past
		// this bucket's elements, and scan backwards
		iterator _Plist = _Get_iter_from_vec(_Vec[_Bucket + 1]);

		// walk back until the position stored for this bucket is reached;
		// the loop body is skipped entirely when both stored positions are equal
		for (; _Plist != _Get_iter_from_vec(_Vec[_Bucket]); )
			if (this->comp(this->_Kfn(_Val), this->_Kfn(*--_Plist)))
				;	// still too high in bucket list
			else if (_Multi
				|| this->comp(this->_Kfn(*_Plist), this->_Kfn(_Val)))
				{	// found insertion point, back up to it
				++_Plist;
				break;
				}
			else
				{	// discard new list element and return existing
				// reached when comp holds in neither direction and !_Multi;
				// a node handed in through _Where is erased, not kept
				if (_Where != end())
					_List.erase(_Where);
				return (_Pairib(_Plist, false));
				}

		// _Plist is now the position in front of which the element goes
		if (_Where != end())
			_List.splice(_Plist, _List, _Where);	// move element into place
		else
			_Where = _List.insert(_Plist, _Val);	// insert new element
		// every stored bucket position equal to the insertion point, from this
		// bucket down towards bucket 0, is redirected to the new element
		for (; _Plist == _Get_iter_from_vec(_Vec[_Bucket]); --_Bucket)
			{	// update end iterators if new first bucket element
			_Vec[_Bucket] = _Where;
			// leave before --_Bucket is applied to index 0
			if (_Bucket == 0)
				break;
			}

		// table became denser than max_load_factor(): enlarge it; the
		// statement controlled by this if is chosen by the preprocessor
		if (max_load_factor() < load_factor())
 #if _HAS_INCREMENTAL_HASH
			_Grow();	// too dense, need to grow hash table

 #else /* _HAS_INCREMENTAL_HASH */
			{	// rehash to bigger table
			size_type _Maxsize = _Vec.max_size() / 2;
			size_type _Newsize = bucket_count();

			// double up to three times, stopping early once _Newsize is no
			// longer below _Maxsize
			for (int _Idx = 0; _Idx < 3 && _Newsize < _Maxsize; ++_Idx)
				_Newsize *= 2;	// multiply safely by 8
			// rebuild the bucket table, then re-place the elements of _List
			_Init(_Newsize);
			_Reinsert();
			}
#endif /* _HAS_INCREMENTAL_HASH */

		return (_Pairib(_Where, true));	// return iterator for new element
		}

真正的insert比较长，慢点分析

首先自然是hash值的计算，鉴于本人对hash算法了解不深刻，就不深入研究和评价了。

然后自然是找到对应的bucket，但这里为什么取的是bucket+1？
a1.如果计算的bucket的hash值【有】对应元素：取bucket+1是因为相同hash值的元素是有序的，它们在list中递增。因此，bucket+1保存的list迭代器是bucket+1的hash值的最小元素。于是，迭代器-1后得到的就是bucket的hash值的最大元素。插入过程将会查找到第一个比_val小的节点，在其后插入。这步当然也是为了有序。
a2.如果计算的bucket的hash值【没有】对应元素：bucket+1的迭代器会和bucket的迭代器相等(见下面分析)。于是会退出查找循环，直接在bucket保存的迭代器位置插入，所以插入list的元素位置会在bucket+1对应list元素前面。这样vector里面元素的位置在list里的顺序将是一致的。
a3.对于相邻的两个bucket v[i]、v[i+1]，list中 [ v[i], v[i+1] ) 区间内的元素hash值都一样，都是hash(*v[i])，且按从小到大的顺序排列；如果bucket i没有元素，则v[i]与v[i+1]相等，该区间为空(见a2)。
附调试结果图：

接下来是元素插入，注意普通insert传入的_Where是list的end()，所以走的是else分支(list.insert)。

再下面是一个for，注释为:update end iterators if new first bucket element

这一步作用有两点：
b1.bucket新插入元素，将bucket之前未初始化的迭代器全部赋值为bucket的迭代器，主要是为了步骤a2。
b2.在同一bucket插入元素：如果插入位置的为bucket第一个迭代器，也就是hash值为bucket的区间中最小值，则会更新这个边界。

弄成这样的数据结构会比一般的hash表遍历更快。

至此插入完毕，然后引发可能的re-hash

void _Reinsert()
{	// insert elements at beginning of list into table
	iterator _First;
	for (; (_First = _List.begin()) != _Get_iter_from_vec(_Vec[0]); )
		_Insert(*_First, _First);
}

1.hash表的bucket数量(min_buckets)初始为8，如果负载因子(list.size()/bucket_count()：即每个bucket下平均有几个节点) > 4.0(默认值，可设)时，将进行re-hash：bucket数量试着扩大8倍(连续乘2三次，且不超过vector的max_size()/2)。此处不明白为何不取大素数，反而都是2^n。
2.然后就是不断的从list首部取出元素重新插入，直到首部和0号bucket迭代器相等。这里插入和非re-hash插入有些区别，主要是传入了一个迭代器where = list.begin(),而不是默认的list.end()。具体表现在找到合适的迭代器后，正常插入是用list.insert，而re-hash用splice。这样list的第一个元素不断地被移到后面，直到第0个bucket保存的迭代器等于list.begin()。

题外：list::splice可能用的比较少，作用是将参数中list的区间节点拼接到this中指定的初始点，且把这个区间从参数list删除。

find：
find实际使用的是lower_bound:
这里过程和insert有点区别，它是从小找到大，也是找到同一个hash区间(传统意义的开链)后，逐个比较，因为有序，所以可以稍快点退出这个循环。这里当然也是使用‘比较器(comp)'和‘等价’的概念来查找的。不过它和一般lower_bound(algorithm里以及基于rbtree的set、map)不一样，如果找不到是直接返回list.end()，而不再是一个合适的位置，因为没必要。还有另外一个函数upper_bound，不过它和lower_bound相反，从大找到小。对于非multi容器来说这个函数没意义。

本来自己也看过一次，没想再看加写起来还是挺累，断断续续耗了近一天时间。

附和rbtree对比的测试代码、结果： env:Win7, I5 2520M, 4G RAM

#include "stdafx.h"
#include <functional>
#include <iostream>
#include <time.h>
#include <string>
#include <hash_set>
#include <set>
using namespace std;
using namespace std::tr1;
using namespace stdext;

// Thin adapter giving every set-like container the same four operations, so
// that the benchmark can bind to one member-function signature per operation.
// The results of insert/find/erase are intentionally discarded.
template<typename Con>
struct SetWrapper
{
	typedef typename Con::value_type		value_type;
	typedef typename Con::const_reference	const_reference;

	Con	con_;	// the wrapped container

	// Forward to Con::insert.
	void insert(const_reference val) { con_.insert(val); }

	// Forward to Con::find.
	void find(const_reference val) { con_.find(val); }

	// Forward to Con::erase (by value/key).
	void erase(const_reference val) { con_.erase(val); }

	// Walk the whole container once and return how many elements were visited.
	int travel()
	{
		int visited = 0;
		typename Con::iterator cur = con_.begin();
		while (cur != con_.end())
		{
			++visited;
			++cur;
		}
		return visited;
	}
};

	// Source of random benchmark values. Only declared here; each supported
	// element type provides its own specialization with a static gen().
	template<typename T>
	struct GeneData;

	// Strings: 5 to 14 random lowercase letters.
	template<>
	struct GeneData<std::string>
	{
		static std::string gen()
		{
			// one rand() for the length, then one rand() per character
			const int length = rand() % 10 + 5;
			std::string text;
			text.reserve(length);
			for (int pos = 0; pos != length; ++pos)
				text.push_back(static_cast<char>(rand() % 26 + 'a'));
			return text;
		}
	};

	// Integers: the next rand() value as is.
	template<>
	struct GeneData<int>
	{
		static int gen()
		{
			const int value = rand();
			return value;
		}
	};
	template<typename T, int N = 10000>
	struct TestSuit 
	{
		typedef SetWrapper< set<T> > RBSet;
		typedef SetWrapper< hash_set<T> > HashSet;

		RBSet	rbset_;
		HashSet	hsset_;
		list<T>	td_;

		void gendata(int c)
		{
			td_.clear();
			for(int i = 0 ; i < c ; ++i)
				td_.push_back(GeneData<T>::gen());
		}
		void run()
		{
			printf("-----type : %s, num : %d-----\n", typeid(T).name(), N);
			gendata(N);

			DWORD dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&RBSet::insert,  rbset_, _Ph<1>()));
			printf("rbset [insert] %d, cost %d ms.\n", N, GetTickCount() - dw);

			dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&HashSet::insert, hsset_, _Ph<1>()));
			printf("hashset [insert] %d, cost %d ms.\n", N, GetTickCount() - dw);



			gendata(N);
			dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&RBSet::find,  rbset_, _Ph<1>()));
			printf("rbset [find] %d, cost %d ms.\n", N, GetTickCount() - dw);

			dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&HashSet::find, hsset_, _Ph<1>()));
			printf("hashset [find] %d, cost %d ms.\n", N, GetTickCount() - dw);



			gendata(N);
			dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&RBSet::erase,  rbset_, _Ph<1>()));
			printf("rbset erase %d, cost %d ms.\n", N, GetTickCount() - dw);

			dw = GetTickCount();
			for_each(td_.begin(), td_.end(), bind(&HashSet::erase, hsset_, _Ph<1>()));
			printf("hashset erase %d, cost %d ms.\n", N, GetTickCount() - dw);

			printf("-----------------------------------------------\n");
		}
	};
	// Entry point: runs the benchmark for int elements, then for string elements.
	int main()
	{
		// A typed constant instead of a function-local "#define N": the macro
		// would stay defined for the rest of the translation unit and has the
		// same name as TestSuit's template parameter N.
		const int kElementCount = 2000000;

		TestSuit<int, kElementCount> tsi;
		tsi.run();

		TestSuit<string, kElementCount> tss;
		tss.run();

		return 0;
	}