三.Trie类的定义
Trie.h:
#include <map>
#include <string>
#include "TrieNode.h"
// Keyword trie built from TrieNode objects. Keywords are split into
// fixed-width units of __pace std::string characters; each unit is one level
// of the tree below __emptyRoot.
class Trie {
private:
// Root of the tree. Every top-level insert/find/dump starts from this node.
TrieNode __emptyRoot;
// Number of std::string characters consumed per tree level. The constructor
// sets it to 1 for TrieNode::UTF8 and 2 for TrieNode::UTF16.
int __pace;
// Walks the existing path for keyword starting at parent and hands the
// unmatched remainder to insertBranch().
TrieNode* insert(TrieNode* parent, std::string& keyword);
// Creates a chain of new child nodes below parent, one per unit of keyword.
TrieNode* insertBranch(TrieNode* parent, std::string& keyword);
// Follows keyword unit by unit below parent; yields NULL when a unit has no
// matching child.
TrieNode* find(TrieNode* parent, std::string& keyword);
// Renders the subtree rooted at parent as nested "[character:children]" text.
std::string toString(TrieNode* parent);
// Recursive worker for getKeywords(); character is the map key under which
// parent is stored.
std::string getKeywords(const std::string& character, const TrieNode* parent);
public:
// Leaf counter: initialised to 0 by the constructor and incremented once for
// every leaf node visited while getKeywords() runs. Not updated by insert().
int __size;
// Creates an empty trie whose unit width is chosen from encoding.
Trie(TrieNode::Encoding encoding);
~Trie();
// Inserts keyword starting from the root.
TrieNode* insert(std::string& keyword);
// Convenience overload: wraps keyword in a std::string and inserts it.
TrieNode* insert(const char* keyword);
// Looks keyword up starting from the root.
TrieNode* find(std::string& keyword);
// Renders the whole tree as bracketed text.
std::string toString();
// Dumps the tree content as hex text produced by Logger::toHex.
std::string getKeywords();
};
// Destructor: performs no explicit work; the members are destroyed
// automatically.
inline Trie::~Trie() {}
说明:
1.__emptyRoot即上面建立的TrieNode树的根节点
2.TrieNode* Trie::find(std::string& keyword)函数用于查找树中是否存在一条与keyword的各个字符依次对应的分支。比如以helloworld构建起来的分支为h->e->l->l->o->w->o->r->l->d,则查找hello时可以沿h->e->l->l->o逐个字符匹配成功,从而确定hello是敏感词
四.Trie类的实现:
Trie.cpp:
#include "Trie.h"
/**
 * Creates an empty trie for the given encoding.
 *
 * The encoding selects __pace, the number of std::string characters that
 * form one unit (one tree level): 1 for TrieNode::UTF8, 2 for
 * TrieNode::UTF16.
 *
 * @param encoding encoding passed to the root TrieNode and used to pick
 *                 the unit width.
 */
Trie::Trie(TrieNode::Encoding encoding)
    : __emptyRoot(TrieNode(encoding)),
      // Start from a defined value: every insert/find reads __pace, so it
      // must never be left indeterminate when the encoding is unrecognized.
      __pace(1),
      __size(0) {
    switch (encoding) {
    case TrieNode::UTF8:
        __pace = 1;
        break;
    case TrieNode::UTF16:
        __pace = 2;
        break;
    default:
        // Report the problem and keep the one-character fallback set above.
        Logger::error("Unrecognized encoding type.");
        break;
    }
}
// Public insertion entry point: delegates to the private overload, using
// the trie's root node as the starting parent.
TrieNode*
Trie::insert(std::string& keyword) {
    TrieNode* ___root = &__emptyRoot;
    return insert(___root, keyword);
}
/**
 * Inserts a keyword given as a C string.
 *
 * @param keyword NUL-terminated keyword; a NULL pointer is rejected.
 * @return the result of insert(std::string&), or NULL when keyword is NULL.
 */
TrieNode*
Trie::insert(const char* keyword) {
    // Constructing std::string from a null pointer is undefined behaviour,
    // so treat it like an empty keyword (nothing inserted).
    if (keyword == NULL) {
        return NULL;
    }
    std::string ___keyword(keyword);
    return insert(___keyword);
}
/**
 * Inserts keyword below parent, reusing the part of the path that already
 * exists.
 *
 * The keyword is consumed in units of __pace characters. As long as the
 * current node already has a child for the next unit, the walk descends
 * into it; at the first missing unit the unmatched remainder is handed to
 * insertBranch().
 *
 * @param parent  node to start from (the root for a top-level insert).
 * @param keyword text to insert.
 * @return the value returned by insertBranch() when new nodes had to be
 *         created; NULL when keyword is empty or its whole path already
 *         exists (nothing was inserted).
 */
TrieNode*
Trie::insert(TrieNode* parent, std::string& keyword) {
    if (keyword.empty()) {
        return NULL;
    }
    const std::string::size_type ___step =
        static_cast<std::string::size_type>(__pace);
    TrieNode* ___node = parent;
    std::string::size_type ___offset = 0;
    // Walking by offset avoids copying the remaining text at every level
    // and, unlike substr(__pace, ...), cannot throw std::out_of_range when
    // the last unit is shorter than __pace.
    while (___offset < keyword.size()) {
        std::string ___unit = keyword.substr(___offset, ___step);
        TrieNode* ___child = ___node->findChild(___unit);
        if (___child == NULL) {
            // First unit without a matching child: grow a new branch here.
            std::string ___rest = keyword.substr(___offset);
            return insertBranch(___node, ___rest);
        }
        ___node = ___child;
        ___offset += ___step;
    }
    // Every unit was already present; no new node was created.
    return NULL;
}
/**
 * Creates a chain of new nodes below parent, one node per unit of
 * __pace characters of keyword.
 *
 * @param parent  node under which the branch is created.
 * @param keyword text whose units become the new nodes.
 * @return the node created for the last unit, or NULL if
 *         TrieNode::insertChild returned NULL for some unit. (Previously
 *         NULL was returned in every case, which made the result useless.)
 */
TrieNode*
Trie::insertBranch(TrieNode* parent, std::string& keyword) {
    const std::string::size_type ___step =
        static_cast<std::string::size_type>(__pace);
    TrieNode* ___node = parent;
    std::string::size_type ___offset = 0;
    // do/while: the first unit is always inserted, as before, even when
    // keyword is shorter than one full unit.
    do {
        std::string ___unit = keyword.substr(___offset, ___step);
        ___node = ___node->insertChild(___unit);
        if (___node == NULL) {
            return NULL;
        }
        // Advancing an offset (instead of substr(__pace, ...)) cannot throw
        // std::out_of_range when the final unit is shorter than __pace.
        ___offset += ___step;
    } while (___offset < keyword.size());
    return ___node;
}
// Public lookup entry point: searches for keyword starting at the trie's
// root node and returns whatever the private overload yields.
TrieNode*
Trie::find(std::string& keyword) {
    TrieNode* ___root = &__emptyRoot;
    return find(___root, keyword);
}
/**
 * Follows keyword below parent, one unit of __pace characters per level.
 *
 * @param parent  node to start from.
 * @param keyword text to look up.
 * @return NULL as soon as a unit has no matching child. Otherwise the node
 *         at which the walk stops: either the node of the last unit of
 *         keyword, or a node without children reached before keyword was
 *         fully consumed.
 */
TrieNode*
Trie::find(TrieNode* parent, std::string& keyword) {
    const std::string::size_type ___step =
        static_cast<std::string::size_type>(__pace);
    TrieNode* ___node = parent;
    std::string::size_type ___offset = 0;
    for (;;) {
        std::string ___unit = keyword.substr(___offset, ___step);
        ___node = ___node->findChild(___unit);
        if (___node == NULL) {
            return NULL;
        }
        ___offset += ___step;
        // Stop on the last unit (also when it is shorter than __pace, which
        // used to make substr(__pace, ...) throw std::out_of_range).
        if (___offset >= keyword.size()) {
            return ___node;
        }
        // A node without children ends the stored branch; nothing deeper
        // can match, so report this node.
        if (___node->__map.empty()) {
            return ___node;
        }
    }
}
std::string
Trie::toString() {
std::string ___result("[");
bool ___isFirstChild = true;
for (TrieNode::_TrieMapIterator ___it = __emptyRoot.__map.begin();
___it != __emptyRoot.__map.end(); ++___it) {
if (___isFirstChild) {
___result.append(toString(const_cast<TrieNode*>(&(___it->second))));
___isFirstChild = false;
} else {
___result.append(",");
___result.append(toString(const_cast<TrieNode*>(&(___it->second))));
}
}
___result.append("]");
return ___result;
}
// Renders the subtree rooted at parent as "[<character>:child,child,...]",
// recursing into every child in map iteration order.
std::string
Trie::toString(TrieNode* parent) {
    std::string ___text("[");
    ___text += parent->getCharacter() + ":";
    bool ___needComma = false;
    TrieNode::_TrieMapIterator ___child = parent->__map.begin();
    while (___child != parent->__map.end()) {
        // Separator goes in front of every child except the first one.
        if (___needComma) {
            ___text += ",";
        }
        ___text += toString(const_cast<TrieNode*>(&(___child->second)));
        ___needComma = true;
        ++___child;
    }
    ___text += "]";
    return ___text;
}
std::string
Trie::getKeywords() {
std::string ___result;
for (TrieNode::_TrieMapIterator ___it = __emptyRoot.__map.begin();
___it != __emptyRoot.__map.end(); ++___it) {
___result.append(getKeywords(___it->first, &(___it->second)));
}
return ___result;
}
/**
 * Recursive worker for getKeywords(): produces one "\n"-terminated line per
 * leaf below parent, each line being the Logger::toHex text of every unit
 * on the path from this node down to that leaf.
 *
 * Previously the hex text of this node was written only once per child, so
 * when a child's subtree produced several lines, every line after the first
 * lost its ancestors. The prefix is now applied to every line.
 *
 * Side effect: increments __size once for every leaf visited.
 *
 * @param character map key under which parent is stored.
 * @param parent    node whose subtree is dumped.
 * @return the lines for all leaves below (or at) parent.
 */
std::string
Trie::getKeywords(const std::string& character, const TrieNode* parent) {
    // Loop-invariant: the hex text of this node's own unit.
    const std::string ___hex = Logger::toHex(character, true);

    // Leaf: this node terminates a stored branch.
    if (parent->__map.size() == 0) {
        __size++;
        return ___hex + "\n";
    }

    std::string ___result;
    for (TrieNode::_TrieMapIterator ___it = parent->__map.begin();
         ___it != parent->__map.end(); ++___it) {
        const std::string ___lines = getKeywords(___it->first, &(___it->second));
        // Prefix every line of the child's output with this node's hex text.
        // NOTE(review): assumes Logger::toHex never emits '\n' itself, so a
        // newline always marks the end of one keyword line - confirm.
        std::string::size_type ___begin = 0;
        while (___begin < ___lines.size()) {
            std::string::size_type ___eol = ___lines.find('\n', ___begin);
            if (___eol == std::string::npos) {
                // Defensive: treat an unterminated tail as the last line.
                ___eol = ___lines.size() - 1;
            }
            ___result.append(___hex);
            ___result.append(___lines, ___begin, ___eol - ___begin + 1);
            ___begin = ___eol + 1;
        }
    }
    return ___result;
}
说明:
1.TrieNode* Trie::insert(TrieNode* parent, std::string& keyword)用于在建立TrieNode树时,以parent为根节点把keyword按照规则添加到树中,一开始parent为__emptyRoot。假设一开始__emptyRoot为空,keyword为hello,则会以hello建立起一条分支h->e->l->l->o。此后,若想再添加hero,由于hero与hello的前两个字符相同,则会在h->e->l->l->o的基础上,从字母e开始新生长出一条分支,即h->e->r->o,这两条分支共用h->e
2.__pace与编码方式有关,表示树的每一层所对应的字符在字符串中占用的长度:utf16编码时__pace为2;utf8是变长编码,上面的代码中取__pace为1,这里不再详述.
本文参考自michael写的文章,具体可参考:http://my.youkuaiyun.com/Poechant