WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__
#include <map>
class CWordNode
public:
#endif // __TOOLS_WORDNODE_H_INCLUDE__
WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__
#include <list>
#include "WordNode.h"
class CWordsFilter
{
public:
private:
#endif // __TOOLS_WORDSFILTER_H_INCLUDE__
WordsFilter.cpp
#include <iostream>
#include <sstream>
#include <fstream>
#include <cmath>
#include "WordsFilter.h"
int nStep = 2;
typedef std::vector<std::string> Tokens;
Tokens StrSplit(const std::string &src, const std::string &sep)
{
int CWordsFilter::GetFirstBytes(const std::string& str)
{
CWordsFilter::CWordsFilter():
CWordsFilter::~CWordsFilter()
{
void CWordsFilter::InitSensitiveWords(std::string strWord)
{
void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
void CWordsFilter::BuildWordTree()
{
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
CWordsFilter& CWordsFilter::GetInstance()
{
{
public class WordFilter
{
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__
#include <map>
/// One node of the sensitive-word tree (a trie keyed by character).
///
/// Each node stores the character it stands for, a flag telling whether a
/// sensitive word ends at this node, and its child nodes keyed by their
/// character. Child nodes are heap-allocated and owned by their parent: they
/// are deleted when the parent is destroyed or reset.
class CWordNode
{
public:
    typedef std::map<std::string, CWordNode*> umap;

    /// Creates a childless node for `word` with the end tag cleared.
    CWordNode(const std::string& word) : m_cWord(word), m_nEndTag(0) {}

    /// Deletes the whole subtree below this node.
    ~CWordNode()
    {
        DestroyChildren();
        this->m_nEndTag = 0;
    }

    /// Re-initialises the node for `word`: clears the end tag and releases
    /// every child. The children are deleted (not merely dropped from the
    /// map) so that resetting a populated node does not leak its subtree.
    void Reset(const std::string& word)
    {
        DestroyChildren();
        this->m_cWord = word;
        this->m_nEndTag = 0;
    }

public:
    std::string m_cWord;   ///< Character represented by this node.
    int m_nEndTag;         ///< 1 when a sensitive word ends here, otherwise 0.
    umap m_mapWordNodes;   ///< Owned children, keyed by their character.

private:
    // Not copyable: a copy would share the child pointers and both objects
    // would delete them. Declared but intentionally left undefined.
    CWordNode(const CWordNode&);
    CWordNode& operator=(const CWordNode&);

    /// Deletes every child node and empties the child map.
    void DestroyChildren()
    {
        for (umap::iterator it = this->m_mapWordNodes.begin(); it != this->m_mapWordNodes.end(); ++it)
        {
            delete it->second;
        }
        this->m_mapWordNodes.clear();
    }
};
#endif // __TOOLS_WORDNODE_H_INCLUDE__
WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__
#include <list>
#include "WordNode.h"
/// Sensitive-word filter backed by a tree of CWordNode objects.
///
/// Typical use: obtain the shared instance with GetInstance(), load the word
/// list once with InitSensitiveWords(), then call FilterSensitiveWords() on
/// each piece of text.
class CWordsFilter
{
    typedef std::map<std::string, CWordNode*> umap;

private:
    std::list<std::string> m_lsAllSensitiveWords; // All configured sensitive words
    CWordNode* m_rootWordNode;                    // Root of the word tree; owned by this object
    bool m_bIsInit;                               // True once a word list has been loaded

public:
    CWordsFilter();
    ~CWordsFilter();

    /// Returns the shared filter instance.
    static CWordsFilter& GetInstance();

    /// Loads the sensitive-word set from a single comma-separated string.
    void InitSensitiveWords(std::string strWord);

    /// Loads the sensitive-word set from a list of words.
    void InitSensitiveWords(std::list<std::string> lsAllSensitiveWords);

    /// Returns a copy of strContent with sensitive words masked.
    std::string FilterSensitiveWords(const std::string& strContent);

private:
    /// Builds the word tree from m_lsAllSensitiveWords.
    void BuildWordTree();

    /// Inserts the characters of strContent below pNode.
    void InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);

    /// Looks up the child of pNode keyed by `word`; NULL when there is none.
    CWordNode* FindNode(CWordNode* pNode, const std::string& word);

    /// Returns the byte length of the first character of str.
    int GetFirstBytes(const std::string& str);

    // Not copyable: the object owns m_rootWordNode through a raw pointer, so
    // an implicit copy would lead to a double delete. Declared but
    // intentionally left undefined.
    CWordsFilter(const CWordsFilter&);
    CWordsFilter& operator=(const CWordsFilter&);
};
#endif // __TOOLS_WORDSFILTER_H_INCLUDE__
WordsFilter.cpp
#include <cmath>
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "WordsFilter.h"
// File-scope step width in bytes; kept for code in this translation unit
// that reads or writes it.
int nStep = 2;

typedef std::vector<std::string> Tokens;

/// Splits `src` into tokens.
///
/// @param src  Text to split.
/// @param sep  Set of separator characters; every character contained in
///             `sep` ends the current token.
/// @return The non-empty tokens in order of appearance. Consecutive, leading
///         and trailing separators never produce empty tokens.
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens r;
    std::string s;
    for (std::string::const_iterator i = src.begin(); i != src.end(); ++i)
    {
        if (sep.find(*i) != std::string::npos)
        {
            // Separator hit: emit the pending token, if any, and start anew.
            if (!s.empty())
            {
                r.push_back(s);
            }
            s.clear();
        }
        else
        {
            s += *i;
        }
    }
    // The last token is not followed by a separator.
    if (!s.empty())
    {
        r.push_back(s);
    }
    return r;
}
/// Returns the byte length of the first character of `str`, judged from the
/// bit pattern of its first byte.
///
/// @param str  Text whose first character is measured.
/// @return 1..6 according to the lead byte. Returns 1 for an empty string and
///         for a byte that is not a lead byte, so callers always advance.
///         The result never exceeds str.size() for a non-empty string, which
///         keeps callers from slicing past the end of truncated input.
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1;
    }

    // Only the first byte decides the length. A byte that matches none of the
    // lead patterns below is treated as a one-byte unit instead of scanning
    // ahead, because the length of a later character says nothing about the
    // first one.
    const unsigned char chLead = static_cast<unsigned char>(str[0]);
    int nLen = 1;
    if ((chLead >> 7) == 0)
    {
        nLen = 1; // 0XXX XXXX: single-byte character
    }
    else if ((chLead >> 1) == 126)
    {
        nLen = 6; // 1111 110X: head of a 6-byte character
    }
    else if ((chLead >> 2) == 62)
    {
        nLen = 5; // 1111 10XX: head of a 5-byte character
    }
    else if ((chLead >> 3) == 30)
    {
        nLen = 4; // 1111 0XXX: head of a 4-byte character
    }
    else if ((chLead >> 4) == 14)
    {
        nLen = 3; // 1110 XXXX: head of a 3-byte character
    }
    else if ((chLead >> 5) == 6)
    {
        nLen = 2; // 110X XXXX: head of a 2-byte character
    }

    // Truncated sequence: report only the bytes that are really there.
    if (static_cast<std::string::size_type>(nLen) > str.size())
    {
        nLen = static_cast<int>(str.size());
    }
    return nLen;
}
/// Creates an uninitialised filter: no words, no word tree.
/// Members are initialised in the order they are declared in the class.
CWordsFilter::CWordsFilter():
    m_lsAllSensitiveWords(),
    m_rootWordNode(NULL),
    m_bIsInit(false)
{
}
/// Releases the word tree; deleting the root deletes all nodes below it.
CWordsFilter::~CWordsFilter()
{
    this->m_lsAllSensitiveWords.clear();
    delete this->m_rootWordNode;
    this->m_rootWordNode = NULL;
}
void CWordsFilter::InitSensitiveWords(std::string strWord)
{
Tokens token = StrSplit(strWord, ",");
std::list<std::string> lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{
lsAllSensitiveWords.push_back(*Ite);++Ite;
}}
InitSensitiveWords(lsAllSensitiveWords);
void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{
std::cout << "start init sensitive words" << std::endl;}
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;
BuildWordTree();
this->m_bIsInit = true;
/// Returns a copy of `strContent` in which every sensitive word is replaced
/// by '*' characters, one '*' per character of the word.
///
/// Matching is attempted at every character position and stops at the first
/// node that ends a word, so when one word is a prefix of another the shorter
/// one is masked. A candidate that reaches the end of the text without
/// completing a word is abandoned like any other failed candidate: its first
/// character is copied through and matching restarts at the next character,
/// so a word contained in the tail of the text is still found.
///
/// @param strContent  Text to filter.
/// @return The filtered text, or an empty string when the filter has not been
///         initialised.
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    typedef std::string::size_type size_type;
    const size_type kMaxCharBytes = 6; // longest length GetFirstBytes reports
    const size_type nTotal = strContent.size();

    std::string strBuffer;
    strBuffer.reserve(nTotal);

    size_type nStart = 0;
    while (nStart < nTotal)
    {
        CWordNode* pNode = this->m_rootWordNode;
        size_type nPos = nStart;       // byte offset of the character being tested
        size_type nFirstLen = 0;       // byte length of the character at nStart
        size_type nMatchedChars = 0;   // characters matched so far from nStart
        bool bHit = false;

        while (nPos < nTotal)
        {
            // Measure the character at nPos; only its leading bytes are needed.
            const int nMeasured = GetFirstBytes(strContent.substr(nPos, kMaxCharBytes));
            size_type nLen = (nMeasured > 0) ? static_cast<size_type>(nMeasured) : 1;
            if (nLen > nTotal - nPos)
            {
                nLen = nTotal - nPos; // truncated trailing sequence
            }
            if (nPos == nStart)
            {
                nFirstLen = nLen;
            }

            pNode = FindNode(pNode, strContent.substr(nPos, nLen));
            if (NULL == pNode)
            {
                break;
            }
            ++nMatchedChars;
            nPos += nLen;
            if (pNode->m_nEndTag == 1)
            {
                bHit = true;
                break;
            }
        }

        if (bHit)
        {
            // Mask the word and continue right after it.
            strBuffer.append(nMatchedChars, '*');
            nStart = nPos;
        }
        else
        {
            // No word starts here: keep this character and move on by one.
            strBuffer.append(strContent, nStart, nFirstLen);
            nStart += nFirstLen;
        }
    }
    return strBuffer;
}
void CWordsFilter::BuildWordTree()
{
if ( this->m_rootWordNode == NULL )
{
this->m_rootWordNode = new CWordNode("R");if (NULL == this->m_rootWordNode){
return;
}
}
this->m_rootWordNode->Reset("R");
std::list<std::string>::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
while (cIte != this->m_lsAllSensitiveWords.end())
{
std::string strTmp = (*cIte);
if (strTmp.size() > 0){
InsertNode(this->m_rootWordNode, strTmp, 0);
}++cIte;
}}
/// Inserts the characters of `strContent` as a chain of nodes below `pNode`,
/// reusing nodes that already exist, and marks the node of the last character
/// as the end of a word.
///
/// @param pNode       Node to insert below; nothing happens when it is NULL.
/// @param strContent  The word to insert.
/// @param nIndex      Byte offset in strContent at which insertion starts.
///                    The caller in this file passes 0 (whole word). A
///                    negative offset or one past the end inserts nothing.
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode || nIndex < 0)
    {
        return;
    }

    typedef std::string::size_type size_type;
    const size_type kMaxCharBytes = 6; // longest length GetFirstBytes reports
    const size_type nTotal = strContent.size();

    CWordNode* pCurrent = pNode;
    size_type nPos = static_cast<size_type>(nIndex);
    while (nPos < nTotal)
    {
        // Measure the character at nPos and never read past the end of the word.
        const int nMeasured = GetFirstBytes(strContent.substr(nPos, kMaxCharBytes));
        size_type nLen = (nMeasured > 0) ? static_cast<size_type>(nMeasured) : 1;
        if (nLen > nTotal - nPos)
        {
            nLen = nTotal - nPos;
        }

        // The node key is the character itself.
        const std::string strChar = strContent.substr(nPos, nLen);
        CWordNode* pChild = FindNode(pCurrent, strChar);
        if (NULL == pChild)
        {
            pChild = new CWordNode(strChar);
            pCurrent->m_mapWordNodes[strChar] = pChild;
        }

        nPos += nLen;
        if (nPos == nTotal)
        {
            pChild->m_nEndTag = 1; // last character: a word ends here
        }
        pCurrent = pChild;
    }
}
/// Looks up the direct child of `pNode` whose key is `word`.
///
/// @param pNode  Node whose children are searched; may be NULL.
/// @param word   Key of the wanted child.
/// @return The child node, or NULL when pNode is NULL or has no such child.
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    if (NULL == pNode)
    {
        return NULL;
    }

    // find() is used instead of operator[] so a miss does not insert an entry.
    umap::iterator Ite = pNode->m_mapWordNodes.find(word);
    if (Ite != pNode->m_mapWordNodes.end())
    {
        return Ite->second;
    }
    return NULL;
}
/// Gives access to the single shared CWordsFilter object.
///
/// @return Reference to the shared filter.
CWordsFilter& CWordsFilter::GetInstance()
{
    // Function-local static: constructed the first time control passes
    // through this declaration and reused by every later call.
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}
以上是C++实现的
-------------------------------------------------------------分割线---------------------------------------------------------------------
下面是C#实现的
/// <summary>
/// One node of the sensitive-word tree: the character it stands for, a flag
/// telling whether a sensitive word ends here, and its children keyed by
/// their character.
/// </summary>
sealed class WordNode : IDisposable
{
    /// <summary>Character represented by this node.</summary>
    public string word;

    /// <summary>1 when a sensitive word ends at this node, otherwise 0.</summary>
    public int endTag;

    /// <summary>Child nodes keyed by their character.</summary>
    public Dictionary<string, WordNode> wordNodes = new Dictionary<string, WordNode>();

    /// <summary>Creates a childless node for <paramref name="word"/>.</summary>
    public WordNode(string word)
    {
        Reset(word);
    }

    /// <summary>
    /// Re-initialises the node: stores <paramref name="word"/>, clears the
    /// end tag and drops all children.
    /// </summary>
    public void Reset(string word)
    {
        this.word = word;
        endTag = 0;
        wordNodes.Clear();
    }

    /// <summary>Resets the node to an empty word with no children.</summary>
    public void Dispose()
    {
        Reset(string.Empty);
    }
}
/// <summary>
/// Sensitive-word filter backed by a tree of <c>WordNode</c> objects.
/// Obtain the shared object through <see cref="Instance"/>, load the words
/// with <see cref="InitSensitiveWords"/>, then filter or test text.
/// </summary>
public class WordFilter
{
    // Created by the static initializer, which the runtime executes once, so
    // no explicit null check or locking is needed in the accessor.
    private static readonly WordFilter instance = new WordFilter();

    /// <summary>The shared filter object.</summary>
    public static WordFilter Instance
    {
        get { return instance; }
    }

    private readonly List<string> allSensitiveWords = new List<string>();
    private WordNode rootWordNode = new WordNode("R");
    private bool isInit = false;

    private WordFilter()
    {
    }

    /// <summary>
    /// Loads the sensitive-word set from a comma-separated string, replacing
    /// any previous set, and rebuilds the word tree.
    /// </summary>
    /// <param name="words">Words separated by ','; empty entries are ignored.
    /// A null value is treated as an empty set.</param>
    public void InitSensitiveWords(string words)
    {
        this.allSensitiveWords.Clear();
        if (!string.IsNullOrEmpty(words))
        {
            string[] wordArr = words.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            this.allSensitiveWords.AddRange(wordArr);
        }
        BuildWordTree();
        this.isInit = true;
    }

    /// <summary>
    /// Returns <paramref name="content"/> with every sensitive word replaced
    /// by '*' characters, one per character of the word.
    /// </summary>
    /// <remarks>
    /// Matching is attempted at every position and stops at the first node
    /// that ends a word. A candidate that runs into the end of the text
    /// without completing a word is abandoned like any other failed
    /// candidate, so a word contained in the tail of the text is still masked.
    /// </remarks>
    /// <returns>The filtered text; the input unchanged when the filter is not
    /// initialised or the input is null or empty.</returns>
    public string FilterSensitiveWords(string content)
    {
        if (!isInit || null == rootWordNode)
        {
            Console.WriteLine("the sensitive word is not init");
            return content;
        }
        if (string.IsNullOrEmpty(content))
        {
            return content;
        }

        StringBuilder buffer = new StringBuilder(content.Length);
        int a = 0;
        while (a < content.Length)
        {
            int matched = MatchLength(content, a);
            if (matched > 0)
            {
                // Mask the word and continue right after it.
                buffer.Append('*', matched);
                a += matched;
            }
            else
            {
                // No word starts here: keep the character and move on by one.
                buffer.Append(content[a]);
                ++a;
            }
        }
        return buffer.ToString();
    }

    /// <summary>
    /// Tells whether <paramref name="content"/> contains at least one
    /// sensitive word.
    /// </summary>
    /// <returns>False when the filter is not initialised or the input is null
    /// or empty.</returns>
    public bool IsHasSensitiveWord(string content)
    {
        if (!isInit || null == rootWordNode)
        {
            Console.WriteLine("the sensitive word is not init");
            return false;
        }
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        for (int start = 0; start < content.Length; ++start)
        {
            if (MatchLength(content, start) > 0)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Walks the word tree along the characters of <paramref name="content"/>
    /// beginning at <paramref name="start"/>.
    /// </summary>
    /// <returns>The number of characters up to and including the first node
    /// that ends a word, or 0 when no word starts at that position.</returns>
    private int MatchLength(string content, int start)
    {
        WordNode node = this.rootWordNode;
        for (int pos = start; pos < content.Length; ++pos)
        {
            node = FindNode(node, content.Substring(pos, 1));
            if (null == node)
            {
                return 0;
            }
            if (node.endTag == 1)
            {
                return pos - start + 1;
            }
        }
        return 0;
    }

    /// <summary>Rebuilds the word tree from the stored word list.</summary>
    private void BuildWordTree()
    {
        if (null == this.rootWordNode)
        {
            this.rootWordNode = new WordNode("R");
        }
        this.rootWordNode.Reset("R");
        for (int index = 0; index < this.allSensitiveWords.Count; ++index)
        {
            string strTmp = this.allSensitiveWords[index];
            if (strTmp.Length > 0)
            {
                InsertNode(this.rootWordNode, strTmp);
            }
        }
    }

    /// <summary>
    /// Inserts the characters of <paramref name="content"/> as a chain of
    /// nodes below <paramref name="node"/>, reusing existing nodes, and marks
    /// the node of the last character as the end of a word.
    /// </summary>
    private void InsertNode(WordNode node, string content)
    {
        if (null == node)
        {
            Console.WriteLine("the root node is null");
            return;
        }
        if (string.IsNullOrEmpty(content))
        {
            return;
        }

        WordNode current = node;
        for (int pos = 0; pos < content.Length; ++pos)
        {
            string key = content.Substring(pos, 1);
            WordNode child = FindNode(current, key);
            if (null == child)
            {
                child = new WordNode(key);
                current.wordNodes[key] = child;
            }
            if (pos == content.Length - 1)
            {
                child.endTag = 1;
            }
            current = child;
        }
    }

    /// <summary>
    /// Returns the direct child of <paramref name="node"/> keyed by
    /// <paramref name="content"/>, or null when there is none or the node is
    /// null.
    /// </summary>
    private WordNode FindNode(WordNode node, string content)
    {
        if (null == node)
        {
            return null;
        }
        WordNode wordNode = null;
        node.wordNodes.TryGetValue(content, out wordNode);
        return wordNode;
    }
}