头文件
#include <iostream>
#include <map>
#include <vector>
using namespace std;
// One node of the dictionary trie.
// Children are keyed by a single wide character; bWord marks that the path
// from the root down to this node spells a complete dictionary entry.
struct TrieNode
{
    bool bWord;                             // true when a stored word ends at this node
    std::map<wchar_t, TrieNode*> mapChild;  // next character -> child node

    // A fresh node has no children and does not terminate a word.
    TrieNode() : bWord(false), mapChild()
    {
    }
};
// A candidate match tracked while scanning the input text.
// Candidates are chained into a doubly linked list through pPre / pNext.
struct FeatureNode
{
    int iStart;                 // start position in the analysed text
    int iEnd;                   // end position in the analysed text
    std::wstring wstrFeature;   // characters matched so far
    FeatureNode* pPre;          // previous node in the list
    FeatureNode* pNext;         // next node in the list
    TrieNode* pNode;            // dictionary node reached by this candidate
    bool bWord;                 // true once the candidate is a complete dictionary word

    // Starts as an empty, unlinked candidate that is not a word.
    FeatureNode()
        : iStart(0),
          iEnd(0),
          wstrFeature(L""),
          pPre(NULL),
          pNext(NULL),
          pNode(NULL),
          bWord(false)
    {
    }

    // Returns a newly allocated node carrying only the span (iStart / iEnd)
    // and the matched text of this one. The links, the dictionary node and
    // the bWord flag keep their default values. The caller owns the result.
    FeatureNode* Clone()
    {
        FeatureNode* pCopy = new FeatureNode();
        pCopy->iStart = iStart;
        pCopy->iEnd = iEnd;
        pCopy->wstrFeature = wstrFeature;
        return pCopy;
    }
};
// Dictionary of words stored back-to-front in a trie, plus a scanner that
// walks an input string from its last character towards its first and keeps
// candidate matches in a linked list of FeatureNode objects.
//
// NOTE(review): the class allocates pRoot / pFeatureHead with new but declares
// no destructor and no copy control; confirm the intended lifetime before
// copying or destroying instances.
class TrieTree
{
private:
    // ---- state ----
    TrieNode* pRoot;                        // root of the dictionary trie
    FeatureNode* pFeatureHead;              // dummy head of the candidate list
    FeatureNode* pfResult;                  // result of the last analysis, or NULL
    std::wstring wstrData;                  // text of the last analysis
    bool bAnalyse;                          // value returned by the last analysis
    std::vector<FeatureNode*> vecFeature;   // cleared by Init()

    // ---- helpers ----
    void InsertWord(TrieNode* pNode, const std::wstring& wstrWord, int& index);
    bool AnalyseFeatureEx(const std::wstring& wstrData);
    bool IsEmptyTree();
    void Init();
    void AnalyseChar(int index);
    TrieNode* AnalyseCharByNode(TrieNode* pNode, const wchar_t& wch);
    void InsertFeatureAtLast(FeatureNode* pNewFeature);
    void DeletePreNode(FeatureNode* pCurr, FeatureNode* pSave);
    void DeleteLink(FeatureNode* pFeature);

public:
    TrieTree();

    // Stores every word of the list in the dictionary.
    void InsertWords(const std::vector<std::wstring>& wstrWords);

    // Analyses a name and records the match, if any, for GetFeature().
    bool AnalyseFeature(const std::wstring& wstrData);

    // Returns the node recorded by the last analysis (may be NULL).
    FeatureNode* GetFeature();
};
#include "TrieTree.h"
// Creates an empty dictionary: a root trie node, a dummy head for the
// candidate list and no result. Init() then resets the remaining state.
TrieTree::TrieTree()
    : pRoot(new TrieNode()),
      pFeatureHead(new FeatureNode()),
      pfResult(NULL)
{
    Init();
}
// Returns the node stored by the most recent analysis, or NULL when none is
// stored. The object keeps ownership: Init() deletes this node, so the
// pointer must not be used after the next analysis of a different string.
FeatureNode* TrieTree::GetFeature()
{
    return this->pfResult;
}
void TrieTree::Init()
{
wstrData = L"";
bAnalyse = false;
if(NULL!=pfResult)
{
delete pfResult;
}
pfResult = NULL;
int iSize = vecFeature.size();
for(int i=0;i<iSize;i++)
{
if(NULL==vecFeature[i])
{
delete vecFeature[i];
vecFeature[i] = NULL;
}
}
vecFeature.clear();
}
// Stores every string of wstrWords in the dictionary trie.
// Each word is inserted starting from its LAST character (see InsertWord),
// so the trie is walked back-to-front. Empty strings are skipped.
void TrieTree::InsertWords(const vector<wstring>& wstrWords)
{
    for(std::vector<std::wstring>::size_type i = 0; i < wstrWords.size(); ++i)
    {
        const wstring& wstrWord = wstrWords[i];
        // Check the word itself; the original tested the whole container here.
        if(wstrWord.empty())
        {
            continue;
        }
        int index = static_cast<int>(wstrWord.length()) - 1;   // last character first
        InsertWord(pRoot, wstrWord, index);
    }
}
// Inserts wstrWord into the trie below pNode, consuming characters from
// position index down to 0 (i.e. from the end of the word to its beginning).
//
// pNode    - node to start from; nothing happens when it is NULL.
// wstrWord - word being stored.
// index    - in/out: position of the next character to store. It is
//            decremented once per stored character and is -1 when the whole
//            word has been stored.
//
// The node reached by the character at position 0 is marked as a word end,
// whether it was just created or already existed. The original only marked
// newly created nodes, so a word whose path was already present (for example
// a shorter word inserted after a longer one sharing its tail) was never
// recognised as a word.
void TrieTree::InsertWord(TrieNode* pNode,const wstring& wstrWord,int &index)
{
    while(NULL != pNode && index >= 0)
    {
        const wchar_t wch = wstrWord.at(index);

        TrieNode* pChild = NULL;
        std::map<wchar_t,TrieNode*>::iterator iter = pNode->mapChild.find(wch);
        if(iter == pNode->mapChild.end())
        {
            // No child for this character yet: create one.
            pChild = new TrieNode();
            pNode->mapChild.insert(std::make_pair(wch, pChild));
        }
        else
        {
            pChild = iter->second;
        }

        if(index == 0)
        {
            // Last character to store: this node terminates the word.
            pChild->bWord = true;
        }

        index--;
        pNode = pChild;
    }
}
// Analyses wstrData against the dictionary.
// Returns false without touching any state when the text is empty or the
// dictionary holds no words. When the text equals the one analysed last
// time, the cached outcome is returned and no work is redone. Otherwise the
// previous state is reset and a new analysis is run.
bool TrieTree::AnalyseFeature(const wstring& wstrData)
{
    const bool bNothingToDo = wstrData.empty() || IsEmptyTree();
    if(bNothingToDo)
    {
        return false;
    }

    const bool bSameAsLast = (this->wstrData == wstrData);
    if(bSameAsLast)
    {
        return bAnalyse;
    }

    Init();
    bAnalyse = AnalyseFeatureEx(wstrData);
    return bAnalyse;
}
// Runs the analysis proper: remembers the text, then feeds its characters to
// AnalyseChar from the last one to the first, stopping as soon as a result
// has been recorded. If the scan ends without a result, the first node of the
// candidate list that is a complete word becomes the result. The remaining
// candidates are freed and the list is left empty.
//
// Always returns true, as the original did; use GetFeature() to learn whether
// a match was actually found.
bool TrieTree::AnalyseFeatureEx(const wstring& wstrData)
{
    this->wstrData = wstrData;

    const int iLen = static_cast<int>(wstrData.length());
    for(int i = iLen - 1; i >= 0; i--)
    {
        AnalyseChar(i);
        if(NULL != pfResult)
        {
            break;
        }
    }

    if(NULL == pfResult)
    {
        // Find the first complete word in the list. The original loop never
        // advanced pCurr and therefore spun forever on a non-word first node.
        FeatureNode* pCurr = pFeatureHead->pNext;
        while(NULL != pCurr && !pCurr->bWord)
        {
            pCurr = pCurr->pNext;
        }

        if(NULL != pCurr)
        {
            // Unlink the result; the successor may be absent (last node).
            pCurr->pPre->pNext = pCurr->pNext;
            if(NULL != pCurr->pNext)
            {
                pCurr->pNext->pPre = pCurr->pPre;
            }
            // Detach it completely so it never points at freed nodes.
            pCurr->pPre = NULL;
            pCurr->pNext = NULL;
            pfResult = pCurr;
        }

        // Free whatever is left and reset the head so it does not dangle.
        DeleteLink(pFeatureHead->pNext);
        pFeatureHead->pNext = NULL;
    }
    return true;
}
// Reports whether the dictionary is empty, i.e. the root has no children.
bool TrieTree::IsEmptyTree()
{
    return pRoot->mapChild.empty();
}
void TrieTree::AnalyseChar(int index)
{
wchar_t wch = wstrData.at(index);
FeatureNode* pCurr = pFeatureHead->pNext;
while(NULL!=pCurr)
{
/*
如果当前节点已经是后缀词,那么要看现在分析的字符是不是紧挨着自己
如果紧挨着自己,就需要进行分析,这样可以获取最大的后缀词
如果不是紧挨着自己,就不需要分析,以免发生错误 例如 大风车酒店 酒店和大酒店都是后缀词
如果不考虑相邻关系,就将后缀词错误的分析成大酒店
*/
if(pCurr->bWord && pCurr->iStart-index>1)
{
pCurr = pCurr->pNext;
continue;
}
TrieNode* pTmp = AnalyseCharByNode(pCurr->pNode,wch);
if(NULL==pTmp)
{
//如果此时 pCurr是一个后缀,且是第一个节点,那么就说明,已经获取到结尾处的第一个后缀
if(pCurr->bWord && pCurr == pFeatureHead->pNext)
{
//获取到后缀词,做清理操作
pfResult = pCurr;
DeleteLink(pCurr->pNext);
pFeatureHead->pNext = NULL;
return ;
}
if(!pCurr->bWord) //当前节点不是单词,且不能继续分析,删除该节点
{
pCurr->pPre->pNext = pCurr->pNext;
if(NULL!=pCurr->pNext)
{
pCurr->pNext->pPre = pCurr->pPre;
}
FeatureNode* pTmp = pCurr->pPre;
delete pCurr;
pCurr = NULL;
pCurr = pTmp;
}
}
else
{
if(pCurr->bWord) //当前节点已经是后缀,这说明可能存在更长的后缀
{
if(pTmp->bWord) //修改当前节点
{
pCurr->iStart = index;
pCurr->pNode = pTmp;
pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
}
else
{
//创建新节点,放在当前节点的后面
FeatureNode* pNew = pCurr->Clone();
pNew->pNode = pTmp;
pNew->wstrFeature.insert(pNew->wstrFeature.begin(),1,wch);
pNew->pNext = pCurr->pNext;
pCurr->pNext = pNew;
pNew->pPre = pCurr;
if(NULL!=pNew->pNext)
{
pNew->pNext->pPre = pNew;
}
pCurr = pNew;
}
}
else
{
if(pTmp->bWord) //找到了后缀
{
//找到后缀后,向前删除可能存在的被自己包含的更短的后缀,以及那些还没有形成后缀的节点
pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
pCurr->bWord = true;
pCurr->iStart = index;
pCurr->pNode = pTmp;
DeletePreNode(pCurr->pPre,pCurr);
}
else //不是后缀,但可以继续分析
{
pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
pCurr->pNode = pTmp;
}
}
}
pCurr = pCurr->pNext;
}
TrieNode* pTmp = AnalyseCharByNode(pRoot,wch);
if(NULL!=pTmp) //在链表的结尾处添加新节点
{
FeatureNode* pNewFeature = new FeatureNode();
pNewFeature->iEnd = index;
pNewFeature->pNode = pTmp;
pNewFeature->wstrFeature.insert(pNewFeature->wstrFeature.begin(),1,wch);
InsertFeatureAtLast(pNewFeature);
}
}
// Frees pFeature and every node reachable from it through pNext.
// A NULL argument is accepted and does nothing. Pointers that other nodes
// hold to the freed nodes are not reset; that is left to the caller.
void TrieTree::DeleteLink(FeatureNode* pFeature)
{
    FeatureNode* pWalk = pFeature;
    while(NULL != pWalk)
    {
        FeatureNode* pFollowing = pWalk->pNext;   // read before the node is freed
        delete pWalk;
        pWalk = pFollowing;
    }
}
// Starting at pCurr and moving towards the list head, removes and frees
// nodes for as long as their end position is not greater than that of pSave.
// Each removed node's predecessor is linked directly to pSave. Stops at the
// dummy head, at a NULL node, or at the first node whose iEnd is greater
// than pSave->iEnd.
void TrieTree::DeletePreNode(FeatureNode* pCurr,FeatureNode* pSave)
{
    if(NULL == pSave)
    {
        return;
    }

    FeatureNode* pVictim = pCurr;
    while(NULL != pVictim && pVictim != pFeatureHead && pSave->iEnd >= pVictim->iEnd)
    {
        FeatureNode* pBefore = pVictim->pPre;
        pBefore->pNext = pSave;    // bypass the node being removed
        pSave->pPre = pBefore;
        delete pVictim;
        pVictim = pBefore;         // continue with the new predecessor of pSave
    }
}
// Appends pNewFeature after the last node of the candidate list.
// A NULL argument is ignored. Only the tail's pNext and the new node's pPre
// are written; the new node's own pNext is left as the caller set it.
void TrieTree::InsertFeatureAtLast(FeatureNode* pNewFeature)
{
    if(NULL != pNewFeature)
    {
        FeatureNode* pTail = pFeatureHead;
        for(; NULL != pTail->pNext; pTail = pTail->pNext)
        {
            // walk to the current tail
        }
        pTail->pNext = pNewFeature;
        pNewFeature->pPre = pTail;
    }
}
// Looks up the child of pNode reached by character wch.
// Returns that child, or NULL when pNode is NULL or has no such child.
TrieNode* TrieTree::AnalyseCharByNode(TrieNode* pNode,const wchar_t& wch)
{
    if(NULL==pNode)
    {
        return NULL;
    }
    // Initialise the iterator directly from find(): assigning NULL to a map
    // iterator, as the original did, is not portable and does not compile on
    // several standard library implementations.
    std::map<wchar_t,TrieNode*>::iterator iter = pNode->mapChild.find(wch);
    if(iter==pNode->mapChild.end())
    {
        return NULL;
    }
    return iter->second;
}
main
// Small manual exercise: loads four words into the dictionary, analyses one
// sample name and fetches the recorded result. Nothing is printed or checked;
// the result is meant to be inspected in a debugger.
void testAnalyse()
{
    TrieTree test;

    const wchar_t* const kWords[] = { L"酒店", L"宾馆", L"酒店式宾馆", L"连锁式宾馆" };
    vector<wstring> vecWstr;
    for(int i = 0; i < 4; ++i)
    {
        vecWstr.push_back(kWords[i]);
    }
    test.InsertWords(vecWstr);

    test.AnalyseFeature(L"大地酒店式宾馆洗澡部");
    FeatureNode* pNode = test.GetFeature();
    (void)pNode;   // intentionally unused
}
// Program entry point: runs the manual exercise and reports success.
int main()
{
    testAnalyse();
    // 0 is the conventional success status; the original returned 1, which
    // the calling environment interprets as a failure.
    return 0;
}