Hash（理解、实现与应用）- CSDN博客

namespace CLOSE_HASH
{
	// State of one slot in the open-addressing (closed hashing) table.
	//   EMPTY  : never occupied; a lookup probe stops here.
	//   EXITS  : holds a live key/value pair (identifier spelling kept for compatibility).
	//   DELETE : tombstone left by Erase; lookups probe past it, Insert may reuse it.
	enum State{EMPTY,EXITS,DELETE};

	// One table slot: the stored pair plus its occupancy state.
	template<class K,class V>
	struct HashDate
	{
		std::pair<K,V> _kv;
		State _state=EMPTY;
	};

	// Primary hash functor: returns the key converted to size_t, so it only
	// works for key types that convert implicitly to size_t.
	template<class K>
	struct Hash
	{
		size_t operator()(const K& key) const
		{
			return key;
		}
	};

	// Specialization for string keys.
	// A string has to be turned into an integer before it can be reduced
	// modulo the table size; different strings should map to different
	// integers as often as possible (e.g. "abcd" vs "bcad", "abbb" vs "abca").
	template<>
	struct Hash<std::string>
	{
		size_t operator()(const std::string& s) const
		{
			// BKDR-style hash: accumulate each character, multiplying by 131.
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}

			return value;
		}
	};

	// Hash table using open addressing with linear probing.
	//   K         : key type
	//   V         : mapped value type
	//   KHashFunc : functor mapping a K to size_t (defaults to Hash<K>)
	template<class K,class V,class KHashFunc=Hash<K>>
	class HashTable
	{
	public:
		// Inserts kv if its key is not present yet.
		// Returns true when the pair was stored, false when the key already exists.
		bool Insert(const std::pair<K, V>& kv)
		{
			// Keys are unique: refuse duplicates instead of storing a second copy.
			if (find(kv.first) != nullptr)
			{
				return false;
			}

			if (_table.size() == 0)
			{
				_table.resize(10);
			}
			else if(_size*10 / _table.size() > 7)
			{
				// Load factor reached 0.8: rebuild into a table twice as large.
				HashTable<K, V, KHashFunc> newHT;
				newHT._table.resize(_table.size() * 2);

				for (auto& e : _table)
				{
					// Only live entries move over; EMPTY slots and tombstones are dropped.
					if (e._state == EXITS)
					{
						newHT.Insert(e._kv);
					}
				}
				_table.swap(newHT._table);
			}

			KHashFunc hf;
			size_t index = hf(kv.first) % _table.size();
			// Linear probing: the load-factor check above guarantees that at
			// least one slot is not EXITS, so this loop terminates.
			while (_table[index]._state == EXITS)
			{
				index = (index + 1) % _table.size();
			}
			_table[index]._kv = kv;
			_table[index]._state = EXITS;
			++_size;
			return true;
		}

		// Looks up key. Returns a pointer to its slot, or nullptr when absent.
		HashDate<K,V>* find(const K& key)
		{
			if (_table.size() == 0)
			{
				return nullptr;
			}
			KHashFunc hf;
			const size_t start = hf(key) % _table.size();
			// Probe at most once around the table: when every slot is EXITS or
			// DELETE there is no EMPTY slot to stop an unbounded probe.
			for (size_t i = 0; i < _table.size(); ++i)
			{
				const size_t index = (start + i) % _table.size();
				if (_table[index]._state == EMPTY)
				{
					return nullptr;
				}
				if (_table[index]._state == EXITS && _table[index]._kv.first == key)
				{
					return &_table[index];
				}
			}
			return nullptr;
		}

		// Removes key by turning its slot into a tombstone.
		// Returns true when the key was present.
		bool Erase(const K& key)
		{
			HashDate<K, V>* ret = find(key);
			if (ret == nullptr)
			{
				return false;
			}
			ret->_state = DELETE;
			--_size; // keep the live-element count in sync with the table
			return true;
		}
	private:
		std::vector<HashDate<K,V>> _table;
		size_t _size=0; // number of live (EXITS) entries
	};
}

开散列

开散列法又叫链地址法（开链法）：首先对关键码集合用散列函数计算散列地址，具有相同地址的关键码归于同一子集合，每一个子集合称为一个桶；各个桶中的元素通过一个单链表链接起来，各链表的头结点存储在哈希表中，因此也叫哈希桶。

namespace OpenHash
{
	// Primary hash functor: returns the key converted to size_t, so it only
	// works for key types that convert implicitly to size_t.
	template<class K>
	struct Hash
	{
		size_t operator()(const K& key) const
		{
			return key;
		}
	};

	// Specialization for string keys.
	// A string has to be turned into an integer before it can be reduced
	// modulo the bucket count; different strings should map to different
	// integers as often as possible (e.g. "abcd" vs "bcad", "abbb" vs "abca").
	template<>
	struct Hash<std::string>
	{
		size_t operator()(const std::string& s) const
		{
			// BKDR-style hash: accumulate each character, multiplying by 131.
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}

			return value;
		}
	};

	// Singly linked node stored in a bucket chain.
	template<class T>
	struct HashNode
	{
		HashNode<T>* _next;
		T _data;

		HashNode(const T& data)
			:_next(nullptr)
			, _data(data)
		{}
	};

	// Forward declaration: the iterator needs to name the table type.
	template<class K, class T, class KeyOfT, class HashFunc>
	class HashTable;

	// Forward iterator over every element of a HashTable.
	// Holds the current node and a pointer back to the owning table so that
	// it can hop to the next non-empty bucket. A null node means end().
	template<class K, class T, class KeyOfT, class HashFunc = Hash<K>>
	struct __HTIterator
	{
		typedef HashNode<T> Node;
		typedef __HTIterator<K, T, KeyOfT, HashFunc> Self;
		typedef HashTable<K, T, KeyOfT, HashFunc> HT;
		Node* _node;
		HT* _pht;

		__HTIterator(Node* node, HT* pht)
			:_node(node)
			, _pht(pht)
		{}

		// Pre-increment. Precondition: the iterator is not end()
		// (the current node is dereferenced).
		Self& operator++()
		{
			// 1. More nodes in the current bucket: step along the chain.
			if (_node->_next)
			{
				_node = _node->_next;
			}
			// 2. Chain exhausted: scan forward for the next non-empty bucket.
			else
			{
				KeyOfT kot;
				HashFunc hf;
				size_t index = hf(kot(_node->_data)) % _pht->_table.size();

				++index;
				while (index < _pht->_table.size())
				{
					if (_pht->_table[index])
					{
						_node = _pht->_table[index];
						return *this;
					}
					++index;
				}
				// No bucket left: become end().
				_node = nullptr;
			}

			return *this;
		}
		T& operator*()
		{
			return _node->_data;
		}
		T* operator->()
		{
			return &_node->_data;
		}
		bool operator != (const Self& s) const
		{
			return _node != s._node;
		}
		bool operator == (const Self& s) const
		{
			return _node == s._node;
		}
	};

	// Hash table with separate chaining (hash buckets).
	//   K        : key type used for lookup
	//   T        : stored element type (K itself for a set, pair<K, V> for a map)
	//   KeyOfT   : functor extracting the key from a T
	//   HashFunc : functor mapping a K to size_t
	template<class K, class T, class KeyOfT, class HashFunc = Hash<K>>
	class HashTable
	{
		typedef HashNode<T> Node;
		// The iterator reads _table directly. The friend template uses its own
		// parameter names because re-using K/T/KeyOfT/HashFunc here would
		// shadow the enclosing template's parameters, which is ill-formed.
		template<class IterK, class IterT, class IterKeyOfT, class IterHashFunc>
		friend struct __HTIterator;
	public:
		typedef __HTIterator<K, T, KeyOfT, HashFunc> iterator;

		HashTable() = default; // explicitly request the default constructor

		// Deep copy: every node of ht is duplicated into the same bucket index.
		HashTable(const HashTable& ht)
		{
			_n = ht._n;
			_table.resize(ht._table.size());
			for (size_t i = 0; i < ht._table.size(); i++)
			{
				Node* cur = ht._table[i];
				while (cur)
				{
					Node* copy = new Node(cur->_data);
					// Push-front into the new bucket.
					copy->_next = _table[i];
					_table[i] = copy;

					cur = cur->_next;
				}
			}
		}

		// Copy-and-swap: ht is a by-value copy whose destructor releases the
		// nodes this table owned before the assignment.
		HashTable& operator=(HashTable ht)
		{
			_table.swap(ht._table);
			std::swap(_n, ht._n);

			return *this;
		}

		~HashTable()
		{
			for (size_t i = 0; i < _table.size(); ++i)
			{
				Node* cur = _table[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_table[i] = nullptr;
			}
		}

		// Iterator to the first node of the first non-empty bucket, or end().
		iterator begin()
		{
			for (size_t i = 0; i < _table.size(); ++i)
			{
				if (_table[i])
				{
					return iterator(_table[i], this);
				}
			}
			return end();
		}
		iterator end()
		{
			return iterator(nullptr, this);
		}

		// Returns the smallest prime in the built-in list that is greater than
		// `prime`; when there is none, returns the largest listed prime.
		size_t GetNextPrime(size_t prime)
		{
			const size_t PRIMECOUNT = 28;
			static const size_t primeList[PRIMECOUNT] =
			{
				53ul, 97ul, 193ul, 389ul, 769ul,
				1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
				49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
				1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
				50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
				1610612741ul, 3221225473ul, 4294967291ul
			};

			for (size_t i = 0; i < PRIMECOUNT; ++i)
			{
				if (primeList[i] > prime)
					return primeList[i];
			}

			// List exhausted: clamp to the last entry rather than reading past it.
			return primeList[PRIMECOUNT - 1];
		}

		// Inserts data if its key is not present yet.
		// Returns {iterator to the element, true} on insertion, or
		// {iterator to the existing element, false} when the key already exists.
		std::pair<iterator, bool> Insert(const T& data)
		{
			KeyOfT kot;
			auto ret = Find(kot(data));
			if (ret != end())
				return std::make_pair(ret, false);

			HashFunc hf;
			// Grow once the load factor reaches 1 (this also covers the empty table).
			if (_n == _table.size())
			{
				const size_t newSize = GetNextPrime(_table.size());
				if (newSize != _table.size())
				{
					std::vector<Node*> newtable;
					newtable.resize(newSize);

					// Relink every existing node into its bucket in the new table;
					// nodes are moved, not reallocated.
					for (size_t i = 0; i < _table.size(); ++i)
					{
						Node* cur = _table[i];
						while (cur)
						{
							Node* next = cur->_next;
							size_t index = hf(kot(cur->_data)) % newtable.size();
							// Push-front.
							cur->_next = newtable[index];
							newtable[index] = cur;

							cur = next;
						}
						_table[i] = nullptr;
					}

					_table.swap(newtable);
				}
			}

			size_t index = hf(kot(data)) % _table.size();
			Node* newnode = new Node(data);

			// Push-front into the target bucket.
			newnode->_next = _table[index];
			_table[index] = newnode;
			++_n;

			return std::make_pair(iterator(newnode, this), true);
		}

		// Returns an iterator to the element with the given key, or end().
		iterator Find(const K& key)
		{
			if (_table.size() == 0)
			{
				return end();
			}
			KeyOfT kot;
			HashFunc hf;
			size_t index = hf(key) % _table.size();
			for (Node* cur = _table[index]; cur; cur = cur->_next)
			{
				if (kot(cur->_data) == key)
				{
					return iterator(cur, this);
				}
			}

			return end();
		}

		// Removes the element with the given key.
		// Returns true when an element was removed.
		bool Erase(const K& key)
		{
			// An empty table has no buckets; bail out before taking the modulo.
			if (_table.size() == 0)
			{
				return false;
			}
			KeyOfT kot;
			HashFunc hf;
			size_t index = hf(key) % _table.size();
			Node* prev = nullptr;
			Node* cur = _table[index];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					if (prev == nullptr)
					{
						// Removing the bucket head.
						_table[index] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}

					--_n;
					delete cur;
					return true;
				}
				prev = cur;
				cur = cur->_next;
			}
			return false;
		}
	private:
		std::vector<Node*> _table; // bucket heads; nullptr marks an empty bucket
		size_t _n = 0;             // number of stored elements
	};
}

封装unordered_map

namespace ljx
{
	template<class K, class V>
	class unordered_map
	{
		struct MapKeyOfT
		{
			const K& operator()(const pair<K, V>& kv)
			{
				return kv.first;
			}
		};
	public:
		typedef typename OpenHash::HashTable<K, pair<K, V>, MapKeyOfT>::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}
		iterator end()
		{
			return _ht.end();
		}
		pair<iterator, bool> insert(const pair<K, V>& kv)
		{
			return _ht.Insert(kv);
		}
		V& operator[](const K& key)
		{
			pair<iterator, bool> ret = _ht.Insert(make_pair(key, V()));
			return ret.first->second;
		}
	private:
		OpenHash::HashTable<K, pair<K, V>, MapKeyOfT> _ht;
	};
}

封装unordered_set

namespace ljx
{
	template<class K>
	class unordered_set
	{
		struct SetKeyOfT
		{
			const K& operator()(const K& k)
			{
				return k;
			}
		};
	public:
		typedef typename OpenHash::HashTable<K, K, SetKeyOfT >::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}
		iterator end()
		{
			return _ht.end();
		}
		pair<iterator, bool> insert(const K k)
		{
			return _ht.Insert(k);
		}
	private:
		OpenHash::HashTable<K, K, SetKeyOfT> _ht;
	};
}

哈希的应用

位图

所谓位图，就是用每一位来存放某种状态，适用于海量数据，数据无重复的场景。通常是用来判断某个数据存不存在的。

实现

namespace Y
{
	// Fixed-capacity bitmap: one bit of state for each value in [0, N).
	// Bits are packed 32 per word; the words are unsigned so that setting
	// bit 31 never shifts into the sign bit of a signed int.
	template<size_t N>
	class BitSet
	{
	public:
		// Allocates enough 32-bit words for N bits, all cleared.
		BitSet()
			: _bits(N / 32 + 1, 0u)
		{}

		// Marks the bit mapped to x as 1. Precondition: x < N.
		void Set(size_t x)
		{
			assert(x < N);

			// OR only touches the selected bit, leaving the others untouched.
			_bits[WordIndex(x)] |= Mask(x);
		}

		// Marks the bit mapped to x as 0. Precondition: x < N.
		void Reset(size_t x)
		{
			assert(x < N);

			// AND with the inverted mask clears only the selected bit.
			_bits[WordIndex(x)] &= ~Mask(x);
		}

		// Returns true when the bit mapped to x is 1. Precondition: x < N.
		bool Test(size_t x) const
		{
			assert(x < N);

			return (_bits[WordIndex(x)] & Mask(x)) != 0;
		}
	private:
		// Index of the word that holds bit x.
		static size_t WordIndex(size_t x)
		{
			return x / 32;
		}

		// Single-bit mask for x inside its word.
		static unsigned int Mask(size_t x)
		{
			return 1u << (x % 32);
		}

		std::vector<unsigned int> _bits;
	};
}

应用于求交集,快速查找一个数是否在一个集合中.

优点：节省空间，速度快。缺点：只能处理整型数据。

布隆过滤器

布隆过滤器是由布隆（Burton Howard Bloom）在1970年提出的一种紧凑型的、比较巧妙的概率型数据结构，特点是高效地插入和查询，可以用来告诉你 “某样东西一定不存在或者可能存在”，它是用多个哈希函数，将一个数据映射到位图结构中。此种方式不仅可以提升查询效率，也可以节省大量的内存空间

布隆过滤器的思想是将一个元素用多个哈希函数映射到一个位图中，因此被映射到的位置的比特位一定为1 。所以可以按照以下方式进行查找：分别计算每个哈希值对应的比特位置存储的是否为零，只要有一个为零，代表该元素一定不在哈希表中，否则可能在哈希表中。

判断“在”是不准确的，存在误判；判断“不在”是准确的。布隆过滤器更多用于字符串类型的数据。

一般不支持删除

有一种删除的办法,使用多个比特位作为计数器,多个值映射时,++计数,删除时,--计数;

struct HashBKDR
{
	// Maps a string to an integer so that it can later be reduced modulo a
	// table or bitmap size. Different strings should produce different values
	// as often as possible (e.g. "int" vs "insert", "abcd" vs "bcad").
	size_t operator()(const std::string& s)
	{
		// BKDR-style hash: add the character, then scale the total by 131.
		size_t acc = 0;
		for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
		{
			acc = (acc + *it) * 131;
		}
		return acc;
	}
};

struct HashAP
{
	// Maps a string to an integer so that it can later be reduced modulo a
	// table or bitmap size. Different strings should produce different values
	// as often as possible (e.g. "int" vs "insert", "abcd" vs "bcad").
	size_t operator()(const std::string& s) const
	{
		// AP hash: characters at even and odd positions are mixed in with
		// two different shift/xor formulas.
		// (`register` was dropped: it is not a valid storage class in C++17.)
		size_t hash = 0;
		for (size_t i = 0; i < s.size(); i++)
		{
			const size_t ch = s[i];
			if ((i & 1) == 0)
			{
				hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
			}
			else
			{
				hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
			}
		}
		return hash;
	}
};

struct HashDJB
{
	// Maps a string to an integer so that it can later be reduced modulo a
	// table or bitmap size. Different strings should produce different values
	// as often as possible (e.g. "int" vs "insert", "abcd" vs "bcad").
	size_t operator()(const std::string& s) const
	{
		// DJB hash: start from 5381, then hash = hash * 33 + ch per character
		// (written as hash += (hash << 5) + ch).
		// (`register` was dropped: it is not a valid storage class in C++17.)
		size_t hash = 5381;
		for (auto ch : s)
		{
			hash += (hash << 5) + ch;
		}

		return hash;
	}
};

template<size_t N, class K = std::string,
class Hash1 = HashBKDR,
class Hash2 = HashAP,
class Hash3 = HashDJB>
class BloomFilter
{
public:
	void Set(const K& key)
	{
		//Hash1 hf1;
		//size_t i1 = hf1(key);
		size_t i1 = Hash1()(key) % N;
		size_t i2 = Hash2()(key) % N;
		size_t i3 = Hash3()(key) % N;

		cout << i1 << " " << i2 << " " << i3 << endl;

		_bitset.Set(i1);
		_bitset.Set(i2);
		_bitset.Set(i3);
	}

	bool Test(const K& key)
	{
		size_t i1 = Hash1()(key) % N;
		if (_bitset.Test(i1) == false)
		{
			return false;
		}

		size_t i2 = Hash2()(key) % N;
		if (_bitset.Test(i2) == false)
		{
			return false;
		}

		size_t i3 = Hash3()(key) % N;
		if (_bitset.Test(i3) == false)
		{
			return false;
		}

		// 这里3个位都在，有可能是其他key占了，在是不准确的，存在误判
		// 不在是准确的
		return true; 
	}

private:
	bit::BitSet<N> _bitset;
	bit::vector<char> _bitset;

};

void TestBloomFilter()
{
	/*BloomFilter<100> bf;
	bf.Set("张三");
	bf.Set("李四");
	bf.Set("牛魔王");
	bf.Set("红孩儿");

	cout << bf.Test("张三") << endl;
	cout << bf.Test("李四") << endl;
	cout << bf.Test("牛魔王") << endl;
	cout << bf.Test("红孩儿") << endl;
	cout << bf.Test("孙悟空") << endl;*/

	BloomFilter<600> bf;

	size_t N = 100;
	std::vector<std::string> v1;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html";
		url += std::to_string(1234 + i);
		v1.push_back(url);
	}

	for (auto& str : v1)
	{
		bf.Set(str);
	}

	for (auto& str : v1)
	{
		cout << bf.Test(str) << endl;
	}
	cout << endl << endl;

	std::vector<std::string> v2;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html";
		url += std::to_string(6789 + i);
		v2.push_back(url);
	}

	size_t n2 = 0;
	for (auto& str : v2)
	{
		if (bf.Test(str))
		{
			++n2;
		}
	}
	cout << "相似字符串误判率:" << (double)n2 / (double)N << endl;

	std::vector<std::string> v3;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://zhuanlan.zhihu.com/p/43263751";
		url += std::to_string(6789 + i);
		v3.push_back(url);
	}

	size_t n3 = 0;
	for (auto& str : v3)
	{
		if (bf.Test(str))
		{
			++n3;
		}
	}
	cout << "不相似字符串误判率:" << (double)n3 / (double)N << endl;

}

哈希切割

哈希切割是一种借助哈希函数进行切分的思想：按哈希值把数据切分成若干份（各份的大小不一定均匀），方便后续分别处理。

通过切分把相同元素放到同一个下标的文件中,然后去比较

比如给一个超过100G大小的log file，log中存着IP地址，设计算法找到出现次数最多的IP地址？

方法:

假设生成A0~A99共100个小文件，依次读取ip，计算每个ip映射的文件号：i=HashBKDR()(ip)%100，这个ip就进入Ai号小文件。相同的ip一定进入了同一个小文件中，所以我们直接统计各个小文件中的次数即可。
再处理A0~A99,读取Ai文件,如果文件大于2G,可以再切分一次,如果小于2G,那就使用一个map<string,int>统计次数