查询后缀词

最新推荐文章于 2025-05-27 00:18:58 发布

酷python

最新推荐文章于 2025-05-27 00:18:58 发布

阅读量1.7k

点赞数

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/KWSY2008/article/details/11534505

头文件

#include <iostream>
#include <map>
#include <vector>
using namespace std;
// One node of the dictionary trie used for suffix lookup.
struct TrieNode
{
    bool bWord;                               // true when a stored word ends at this node
    std::map<wchar_t, TrieNode*> mapChild;    // children, keyed by the character leading to them

    // Builds a childless node that is not marked as the end of a word.
    TrieNode() : bWord(false), mapChild()
    {
    }
};

struct FeatureNode
{
    int iStart;                 //起始位置
    int iEnd;                   //结束位置
    wstring wstrFeature;        //特征词
    FeatureNode* pPre;          //前驱
    FeatureNode* pNext;         //后驱
    TrieNode* pNode;            //记录当前遍历到的字典节点
    bool bWord;

    FeatureNode()
    {
        iStart = iEnd = 0;
        wstrFeature = L"";
        pPre = NULL;
        pNext = NULL;
        pNode = NULL;
        bWord = false;
    }

    FeatureNode* Clone()
    {
        FeatureNode* pNew = new FeatureNode();
        pNew->iEnd = this->iEnd;
        pNew->iStart = this->iStart;
        pNew->wstrFeature = this->wstrFeature;
        return pNew;
    }
    
    
};
class TrieTree
{
private:
    TrieNode *pRoot;
    FeatureNode *pFeatureHead;
    FeatureNode *pfResult;
    wstring wstrData;
    bool bAnalyse;
    vector<FeatureNode*> vecFeature;
private:   
    void InsertWord(TrieNode* pNode,const wstring& wstrWord,int &index);
    bool AnalyseFeatureEx(const wstring& wstrData);
    bool IsEmptyTree();
    void Init();
    void AnalyseChar(int index);
    TrieNode* AnalyseCharByNode(TrieNode* pNode,const wchar_t& wch);
    void InsertFeatureAtLast(FeatureNode* pNewFeature);
    void DeletePreNode(FeatureNode* pCurr,FeatureNode* pSave);
    void DeleteLink(FeatureNode* pCurr);
public:
    TrieTree();
    //存入后缀词
    void InsertWords(const vector<wstring>& wstrWords);
    //解析名称
    bool AnalyseFeature(const wstring& wstrData);

    FeatureNode* GetFeature();
};

cpp文件

#include "TrieTree.h"
// Allocates the trie root and the sentinel head of the candidate list, then
// resets the analysis state.
TrieTree::TrieTree()
    : pRoot(new TrieNode()),
      pFeatureHead(new FeatureNode()),
      pfResult(NULL)
{
    Init();
}
// Returns the node found by the last analysis, or NULL when there is none.
// The node is deleted by the next Init(), so do not keep the pointer across
// another analysis.
FeatureNode* TrieTree::GetFeature()
{
    return this->pfResult;
}
void TrieTree::Init()
{   
    wstrData = L"";
    bAnalyse = false;
    if(NULL!=pfResult)
    {
        delete pfResult;
    }
    pfResult = NULL;

    int iSize = vecFeature.size();
    for(int i=0;i<iSize;i++)
    {
        if(NULL==vecFeature[i])
        {
            delete vecFeature[i];
            vecFeature[i] = NULL;
        }   
    }
    vecFeature.clear();
}
// Stores every string of the vector in the dictionary trie.
// Empty strings are skipped. Each word is inserted starting from its last
// character (see InsertWord).
void TrieTree::InsertWords(const vector<wstring>& wstrWords)
{
    for(vector<wstring>::const_iterator it = wstrWords.begin(); it != wstrWords.end(); ++it)
    {
        const wstring& wstrWord = *it;
        // Test the word itself; the original tested the whole vector here,
        // which is always non-empty inside this loop.
        if(wstrWord.empty())
        {
            continue;
        }

        int index = static_cast<int>(wstrWord.length()) - 1;
        InsertWord(pRoot,wstrWord,index);
    }
}

// Inserts wstrWord[0..index] into the trie below pNode, consuming characters
// from position `index` down to 0 (the word is stored back to front).
//
// pNode    - node to start from; nothing happens when it is NULL.
// wstrWord - word being inserted.
// index    - in/out: position of the next character to insert. It is
//            decremented once per consumed character and is -1 after a
//            complete insertion. Negative values insert nothing.
//
// wstrWord.at() throws std::out_of_range when index is past the end.
void TrieTree::InsertWord(TrieNode* pNode,const wstring& wstrWord,int &index)
{
    while(NULL != pNode && index >= 0)
    {
        const wchar_t wch = wstrWord.at(index);

        TrieNode* pChild = NULL;
        map<wchar_t,TrieNode*>::iterator iter = pNode->mapChild.find(wch);
        if(iter == pNode->mapChild.end())
        {
            // No child for this character yet: create one.
            pChild = new TrieNode();
            pNode->mapChild.insert(make_pair(wch,pChild));
        }
        else
        {
            pChild = iter->second;
        }

        // The first character of the word is the last one inserted. Mark the
        // node as a word end even when it already existed; otherwise a word
        // that is a suffix of an earlier, longer word would never be flagged.
        if(NULL != pChild && 0 == index)
        {
            pChild->bWord = true;
        }

        index--;
        pNode = pChild;
    }
}

// Analyses wstrData for a suffix word.
// Returns false when the text is empty or the dictionary has no words.
// When the text equals the previously analysed one, the cached return value
// is handed back without analysing again; otherwise the state is reset and a
// fresh analysis is run.
bool TrieTree::AnalyseFeature(const wstring& wstrData)
{
    const bool bUsable = !wstrData.empty() && !IsEmptyTree();
    if(!bUsable)
    {
        return false;
    }

    const bool bSameAsLast = (this->wstrData == wstrData);
    if(!bSameAsLast)
    {
        Init();
        bAnalyse = AnalyseFeatureEx(wstrData);
    }
    return bAnalyse;
}
// Runs the analysis on wstrData: feeds the characters to AnalyseChar from the
// last one to the first and stops as soon as a result has been fixed.
// If the scan reaches the start of the text without a result, the first
// candidate in the list that is a complete word becomes the result.
// Always returns true; whether a suffix was found is told by pfResult
// (see GetFeature()).
bool TrieTree::AnalyseFeatureEx(const wstring& wstrData)
{
    this->wstrData = wstrData;
    int iLen = static_cast<int>(wstrData.length());

    for(int i = iLen-1; i >= 0; i--)
    {
        AnalyseChar(i);
        if(NULL != pfResult)
        {
            break;
        }
    }

    if(NULL == pfResult)
    {
        // Find the first candidate that is a complete suffix word. The loop
        // must advance; the original spun forever on a non-word candidate.
        FeatureNode* pCurr = pFeatureHead->pNext;
        while(NULL != pCurr && !pCurr->bWord)
        {
            pCurr = pCurr->pNext;
        }

        if(NULL != pCurr)
        {
            // Unlink the result so that it survives the list clean-up below.
            pCurr->pPre->pNext = pCurr->pNext;
            if(NULL != pCurr->pNext)        // the result may be the last node
            {
                pCurr->pNext->pPre = pCurr->pPre;
            }
            pCurr->pPre = NULL;
            pCurr->pNext = NULL;
            pfResult = pCurr;
        }

        // Free the remaining candidates and leave the list empty, so that the
        // head never points at freed memory.
        DeleteLink(pFeatureHead->pNext);
        pFeatureHead->pNext = NULL;
    }
    return true;
}
// Tells whether the tree is empty, i.e. the root has no children.
bool TrieTree::IsEmptyTree()
{
    return pRoot->mapChild.empty();
}

void TrieTree::AnalyseChar(int index)
{
    wchar_t wch = wstrData.at(index);
    FeatureNode* pCurr = pFeatureHead->pNext;
    while(NULL!=pCurr)
    {
        /*
        如果当前节点已经是后缀词，那么要看现在分析的字符是不是紧挨着自己
        如果紧挨着自己，就需要进行分析，这样可以获取最大的后缀词
        如果不是紧挨着自己，就不需要分析，以免发生错误 例如 大风车酒店  酒店和大酒店都是后缀词
        如果不考虑相邻关系，就将后缀词错误的分析成大酒店
        */

        if(pCurr->bWord && pCurr->iStart-index>1)
        {
            pCurr = pCurr->pNext;       
            continue;                   
        }

        TrieNode* pTmp = AnalyseCharByNode(pCurr->pNode,wch);


        if(NULL==pTmp)
        {
            //如果此时 pCurr是一个后缀，且是第一个节点，那么就说明，已经获取到结尾处的第一个后缀
            if(pCurr->bWord && pCurr == pFeatureHead->pNext)
            {
                //获取到后缀词，做清理操作
                pfResult = pCurr;
                DeleteLink(pCurr->pNext);
                pFeatureHead->pNext = NULL;
                return ;
            }

            if(!pCurr->bWord)       //当前节点不是单词，且不能继续分析，删除该节点
            {
                pCurr->pPre->pNext = pCurr->pNext;
                if(NULL!=pCurr->pNext)
                {
                    pCurr->pNext->pPre = pCurr->pPre;
                }                
                FeatureNode* pTmp = pCurr->pPre;
                delete pCurr;
                pCurr = NULL;
                pCurr = pTmp;
            }
        }
        else
        {
            if(pCurr->bWord)        //当前节点已经是后缀，这说明可能存在更长的后缀
            {
                if(pTmp->bWord)     //修改当前节点
                {
                    pCurr->iStart = index;
                    pCurr->pNode = pTmp;
                    pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
                }
                else
                {
                    //创建新节点，放在当前节点的后面
                    FeatureNode* pNew = pCurr->Clone();
                    pNew->pNode = pTmp;
                    pNew->wstrFeature.insert(pNew->wstrFeature.begin(),1,wch);
                    pNew->pNext = pCurr->pNext;
                    pCurr->pNext = pNew;
                    pNew->pPre = pCurr;
                    if(NULL!=pNew->pNext)
                    {
                        pNew->pNext->pPre = pNew;
                    }                   
                    pCurr = pNew;
                }
            }
            else
            {
                if(pTmp->bWord)     //找到了后缀
                {
                    //找到后缀后，向前删除可能存在的被自己包含的更短的后缀,以及那些还没有形成后缀的节点
                    pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
                    pCurr->bWord = true;
                    pCurr->iStart = index;
                    pCurr->pNode = pTmp;

                    DeletePreNode(pCurr->pPre,pCurr);
                }
                else                //不是后缀，但可以继续分析
                {
                    pCurr->wstrFeature.insert(pCurr->wstrFeature.begin(),1,wch);
                    pCurr->pNode = pTmp;
                }
            }
        }

        pCurr = pCurr->pNext;
    }

    TrieNode* pTmp = AnalyseCharByNode(pRoot,wch);
    if(NULL!=pTmp)      //在链表的结尾处添加新节点
    {
        FeatureNode* pNewFeature = new FeatureNode();
        pNewFeature->iEnd = index;
        pNewFeature->pNode = pTmp;
        pNewFeature->wstrFeature.insert(pNewFeature->wstrFeature.begin(),1,wch);
        InsertFeatureAtLast(pNewFeature);
    }
}

// Frees pFeature and every node reachable from it through pNext.
// Passing NULL is allowed and does nothing. Pointers held elsewhere to the
// freed nodes are not reset.
void TrieTree::DeleteLink(FeatureNode* pFeature)
{
    while(NULL != pFeature)
    {
        FeatureNode* pFollowing = pFeature->pNext;
        delete pFeature;
        pFeature = pFollowing;
    }
}
// Walks backwards from pCurr and removes each node whose iEnd is not greater
// than pSave->iEnd, relinking pSave to the node in front of the removed one.
// Stops at the list head, at NULL, or at the first node with a larger iEnd.
void TrieTree::DeletePreNode(FeatureNode* pCurr,FeatureNode* pSave)
{
    if(NULL == pSave)
    {
        return ;
    }

    while(NULL != pCurr && pCurr != pFeatureHead && pSave->iEnd >= pCurr->iEnd)
    {
        FeatureNode* pBefore = pCurr->pPre;
        pBefore->pNext = pSave;
        pSave->pPre = pBefore;
        delete pCurr;
        pCurr = pBefore;
    }
}
// Appends pNewFeature after the last node of the candidate list.
// A NULL argument is ignored. pNewFeature->pNext is left untouched.
void TrieTree::InsertFeatureAtLast(FeatureNode* pNewFeature)
{
    if(NULL != pNewFeature)
    {
        FeatureNode* pTail = pFeatureHead;
        for(; NULL != pTail->pNext; pTail = pTail->pNext)
        {
        }

        pTail->pNext = pNewFeature;
        pNewFeature->pPre = pTail;
    }
}
// Looks up the child of pNode reached by character wch.
// Returns that child, or NULL when pNode is NULL or has no such child.
TrieNode* TrieTree::AnalyseCharByNode(TrieNode* pNode,const wchar_t& wch)
{
    if(NULL==pNode)
    {
        return NULL;
    }

    // Initialise the iterator from find() directly: a map iterator cannot be
    // constructed from NULL on standard-conforming libraries.
    map<wchar_t,TrieNode*>::iterator iter = pNode->mapChild.find(wch);
    if(iter==pNode->mapChild.end())
    {
        return NULL;
    }

    return iter->second;
}

main

void testAnalyse()
{
    TrieTree test;
    vector<wstring> vecWstr;
    vecWstr.push_back(L"酒店");
    vecWstr.push_back(L"宾馆");
    vecWstr.push_back(L"酒店式宾馆");
    vecWstr.push_back(L"连锁式宾馆");
    test.InsertWords(vecWstr);
    test.AnalyseFeature(L"大地酒店式宾馆洗澡部");

    FeatureNode *pNode = test.GetFeature();
}
// Program entry point: runs the demo analysis.
int main()
{
    testAnalyse();
    // 0 reports success to the caller; the original returned 1, which shells
    // and build tools read as a failure.
    return 0;
}