敏感字符过滤

最新推荐文章于 2025-06-19 15:03:44 发布

原创 · 最新推荐文章于 2025-06-19 15:03:44 发布 · 1.2k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#敏感字

game 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一种基于树形结构的敏感词过滤算法，并提供了C++和C#两种语言的具体实现。通过构建敏感词树，可以高效地检测并替换文本中的敏感词汇。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__

#include <map>

// One node of the sensitive-word tree.
//
// A node stores the character it represents (m_cWord, kept as a string so a
// multi-byte character fits), a flag telling whether a sensitive word ends at
// this node (m_nEndTag == 1), and its children keyed by their character.
//
// Ownership: every non-null pointer stored in m_mapWordNodes is owned by this
// node and is deleted by Reset() and by the destructor. Children must
// therefore be allocated with `new`.
class CWordNode
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    // Creates a leaf node for `word` with no children and a cleared end tag.
    CWordNode(const std::string& word)
        : m_cWord(word)
        , m_nEndTag(0)
    {
    }

    // Recursively destroys the whole subtree below this node.
    ~CWordNode()
    {
        DestroyChildren();
    }

    // Re-initialises the node: stores `word`, clears the end tag and destroys
    // every child. Children are deleted (not merely forgotten) so that
    // resetting a populated node does not leak its subtree.
    void Reset(const std::string& word)
    {
        DestroyChildren();
        this->m_cWord = word;
        this->m_nEndTag = 0;
    }

private:
    // Not copyable: a member-wise copy would share the child pointers and
    // both copies would delete them. Declared but intentionally not defined.
    CWordNode(const CWordNode&);
    CWordNode& operator=(const CWordNode&);

    // Deletes all owned children and empties the child map.
    void DestroyChildren()
    {
        for (umap::iterator it = this->m_mapWordNodes.begin();
             it != this->m_mapWordNodes.end(); ++it)
        {
            delete it->second; // deleting a null pointer is a no-op
            it->second = NULL;
        }
        this->m_mapWordNodes.clear();
    }

public:
    std::string m_cWord;   // character represented by this node
    int m_nEndTag;         // 1 when a sensitive word ends here, otherwise 0
    umap m_mapWordNodes;   // owned children, keyed by their character
};

#endif // __TOOLS_WORDNODE_H_INCLUDE__

WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__

#include <list>
#include "WordNode.h"

// Sensitive-word filter backed by a character tree of CWordNode objects.
//
// Typical use: call InitSensitiveWords() once with the word list, then call
// FilterSensitiveWords() for each piece of text.
class CWordsFilter
{
    typedef std::map<std::string, CWordNode*> umap;

private:
    std::list<std::string> m_lsAllSensitiveWords; // list of all sensitive words
    CWordNode* m_rootWordNode;                    // root of the word tree; owned, deleted in the destructor
    bool m_bIsInit;                               // true once InitSensitiveWords() has run

    // Not copyable: the object owns m_rootWordNode, so a member-wise copy
    // would make two objects delete the same tree. Declared but
    // intentionally not defined.
    CWordsFilter(const CWordsFilter&);
    CWordsFilter& operator=(const CWordsFilter&);

public:
    CWordsFilter();
    ~CWordsFilter();

    // Returns the shared instance.
    static CWordsFilter& GetInstance();

    // Initialises the sensitive-word set from a comma-separated string.
    void InitSensitiveWords(std::string strWord);
    // Initialises the sensitive-word set from a list of words.
    void InitSensitiveWords(std::list<std::string> lsAllSensitiveWords);
    // Returns a copy of strContent with sensitive words masked.
    std::string FilterSensitiveWords(const std::string& strContent);

private:
    // Builds the sensitive-word tree from m_lsAllSensitiveWords.
    void BuildWordTree();
    // Inserts the characters of strContent below pNode.
    void InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);
    // Looks up the child of pNode keyed by `word`; NULL when absent.
    CWordNode* FindNode(CWordNode* pNode, const std::string& word);
    // Returns the byte length of the first character in str.
    int GetFirstBytes(const std::string& str);
};

#endif // __TOOLS_WORDSFILTER_H_INCLUDE__

WordsFilter.cpp
#include <cmath>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include "WordsFilter.h"

// File-scope scratch variable for holding a character's byte length
// (initial value 2).
// NOTE(review): as a non-static mutable global it has external linkage and is
// shared by every caller, so it is not safe for concurrent use; a local
// variable in each function would be preferable.
int nStep = 2;

typedef std::vector<std::string> Tokens;

// Splits `src` into tokens.
//
// Every character contained in `sep` is treated as a delimiter. Runs of
// delimiters, as well as leading and trailing delimiters, produce no empty
// tokens. An empty `sep` yields `src` as a single token (or nothing when
// `src` is empty).
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens r;

    std::string::size_type begin = src.find_first_not_of(sep);
    while (begin != std::string::npos)
    {
        const std::string::size_type end = src.find_first_of(sep, begin);
        if (end == std::string::npos)
        {
            // Last token: runs to the end of the input.
            r.push_back(src.substr(begin));
            break;
        }
        r.push_back(src.substr(begin, end - begin));
        begin = src.find_first_not_of(sep, end);
    }

    return r;
}

// Returns the byte length of the first character of `str`, interpreting the
// bytes as UTF-8 (including the historical 5- and 6-byte lead patterns).
//
// Only the first byte is inspected. A byte that is not a valid lead byte
// (a stray continuation byte 10xx xxxx, or 0xFE/0xFF) is reported as a
// one-byte unit so that callers advance past it and resynchronise on the next
// real character, instead of receiving the length of some later character.
// The result never exceeds str.size(), so a sequence truncated at the end of
// the input is reported with the number of bytes actually present.
//
// An empty string yields 1; callers use the result as a step width, so it
// must stay positive.
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1;
    }

    const unsigned char chLead = static_cast<unsigned char>(str[0]);
    int nLength = 1;

    if ((chLead >> 7) == 0)            // 0XXX XXXX: single-byte character
    {
        nLength = 1;
    }
    else if ((chLead >> 5) == 6)       // 110X XXXX: head of a 2-byte character
    {
        nLength = 2;
    }
    else if ((chLead >> 4) == 14)      // 1110 XXXX: head of a 3-byte character
    {
        nLength = 3;
    }
    else if ((chLead >> 3) == 30)      // 1111 0XXX: head of a 4-byte character
    {
        nLength = 4;
    }
    else if ((chLead >> 2) == 62)      // 1111 10XX: head of a 5-byte character
    {
        nLength = 5;
    }
    else if ((chLead >> 1) == 126)     // 1111 110X: head of a 6-byte character
    {
        nLength = 6;
    }
    // Anything else is a continuation or invalid byte: keep nLength == 1.

    // Clamp to the bytes that are really available (truncated sequence).
    if (static_cast<std::string::size_type>(nLength) > str.size())
    {
        nLength = static_cast<int>(str.size());
    }
    return nLength;
}

// Creates an uninitialised filter: no words, no tree, not ready to filter.
CWordsFilter::CWordsFilter()
    : m_lsAllSensitiveWords()
    , m_rootWordNode(NULL)
    , m_bIsInit(false)
{
}

// Releases the word list and the word tree owned by this filter.
CWordsFilter::~CWordsFilter()
{
    m_lsAllSensitiveWords.clear();

    if (m_rootWordNode != NULL)
    {
        delete m_rootWordNode;
        m_rootWordNode = NULL;
    }
}

void CWordsFilter::InitSensitiveWords(std::string strWord)

{

Tokens token = StrSplit(strWord, ",");
std::list<std::string> lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{

lsAllSensitiveWords.push_back(*Ite);

++Ite;

}
InitSensitiveWords(lsAllSensitiveWords);

}

void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{

std::cout << "start init sensitive words" << std::endl;
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;

BuildWordTree();
this->m_bIsInit = true;

}

// Returns a copy of `strContent` in which every sensitive word is masked.
//
// The text is scanned character by character (character widths come from
// GetFirstBytes). At each position the word tree is walked from the root; the
// first node carrying an end tag terminates the walk, so the shortest word
// starting at that position wins. A matched word is replaced by one '*' per
// character (not per byte). When nothing matches, the character at the
// current position is copied unchanged and scanning resumes at the next one.
//
// A partial match that is still pending when the input ends is handled like
// any other failed match, so a shorter sensitive word inside it is still found.
//
// Returns an empty string (after logging) when the filter has not been
// initialised.
// NOTE(review): an uninitialised filter discards the text rather than
// returning it unfiltered - confirm callers expect this.
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    const std::string::size_type nTotal = strContent.size();
    std::string strBuffer;
    strBuffer.reserve(nTotal);

    std::string::size_type nStart = 0;
    while (nStart < nTotal)
    {
        CWordNode* pNode = this->m_rootWordNode;
        std::string::size_type nPos = nStart;       // byte offset of the walk
        std::string::size_type nFirstLen = 0;       // byte length of the character at nStart
        std::string::size_type nMatchedChars = 0;   // characters consumed by the walk
        bool bMatched = false;

        while (nPos < nTotal)
        {
            // Six bytes is the longest sequence GetFirstBytes can report, so
            // a short look-ahead is enough and avoids copying the whole tail.
            const int nReported = GetFirstBytes(strContent.substr(nPos, 6));
            std::string::size_type nCharLen =
                (nReported < 1) ? 1 : static_cast<std::string::size_type>(nReported);
            if (nCharLen > nTotal - nPos)
            {
                // Truncated multi-byte sequence at the end of the input:
                // treat the remaining bytes as one unit instead of reading
                // past the end.
                nCharLen = nTotal - nPos;
            }
            if (nPos == nStart)
            {
                nFirstLen = nCharLen;
            }

            pNode = FindNode(pNode, strContent.substr(nPos, nCharLen));
            if (NULL == pNode)
            {
                break;
            }

            nPos += nCharLen;
            ++nMatchedChars;

            if (pNode->m_nEndTag == 1)
            {
                bMatched = true;
                break;
            }
        }

        if (bMatched)
        {
            strBuffer.append(nMatchedChars, '*');
            nStart = nPos;
        }
        else
        {
            strBuffer.append(strContent, nStart, nFirstLen);
            nStart += nFirstLen;
        }
    }

    return strBuffer;
}

// (Re)builds the sensitive-word tree from m_lsAllSensitiveWords.
//
// Any previously built tree is destroyed first, so calling
// InitSensitiveWords() again replaces the old word set instead of keeping or
// leaking it. Empty words are skipped.
void CWordsFilter::BuildWordTree()
{
    // ~CWordNode deletes the whole subtree, so this frees the old tree.
    delete this->m_rootWordNode;
    this->m_rootWordNode = NULL; // stay consistent if the allocation below throws

    // operator new throws on failure, so no NULL check is needed afterwards.
    this->m_rootWordNode = new CWordNode("R");

    for (std::list<std::string>::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
         cIte != this->m_lsAllSensitiveWords.end(); ++cIte)
    {
        if (!cIte->empty())
        {
            InsertNode(this->m_rootWordNode, *cIte, 0);
        }
    }
}

// Inserts the characters of `strContent` as a chain of nodes below `pNode`
// and marks the node of the last character as the end of a sensitive word.
//
// pNode      - node to insert under; nothing happens when it is NULL.
// strContent - the word to insert; split into characters with GetFirstBytes.
// nIndex     - byte offset in strContent at which insertion starts. Every
//              caller in this file passes 0; negative values are treated as 0.
//
// Existing nodes are reused, so words sharing a prefix share that part of the
// tree.
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode)
    {
        return;
    }

    const std::string::size_type nTotal = strContent.size();
    std::string::size_type nPos =
        (nIndex > 0) ? static_cast<std::string::size_type>(nIndex) : 0;

    CWordNode* pCurrent = pNode;
    while (nPos < nTotal)
    {
        // Six bytes is the longest sequence GetFirstBytes can report.
        const int nReported = GetFirstBytes(strContent.substr(nPos, 6));
        std::string::size_type nCharLen =
            (nReported < 1) ? 1 : static_cast<std::string::size_type>(nReported);
        if (nCharLen > nTotal - nPos)
        {
            // Truncated multi-byte sequence: use the bytes that exist.
            nCharLen = nTotal - nPos;
        }

        const std::string strChar = strContent.substr(nPos, nCharLen);

        // operator[] creates a NULL slot when the key is new; filling the
        // slot afterwards means the map never loses track of a node even if
        // the allocation throws.
        CWordNode*& rpChild = pCurrent->m_mapWordNodes[strChar];
        if (NULL == rpChild)
        {
            rpChild = new CWordNode(strChar);
        }

        nPos += nCharLen;
        if (nPos == nTotal)
        {
            rpChild->m_nEndTag = 1; // last character of the word
        }
        pCurrent = rpChild;
    }
}

// Returns the child of `pNode` stored under the key `word`, or NULL when
// `pNode` is NULL or has no such child.
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    if (pNode != NULL)
    {
        umap& mapChildren = pNode->m_mapWordNodes;
        const umap::iterator itChild = mapChildren.find(word);
        if (itChild != mapChildren.end())
        {
            return itChild->second;
        }
    }
    return NULL;
}

// Returns the shared filter instance, a function-local static that is
// constructed the first time this function is called.
CWordsFilter& CWordsFilter::GetInstance()
{
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}

以上是C++实现的

-------------------------------------------------------------分割线---------------------------------------------------------------------

下面是C#实现的

/// <summary>
/// One node of the sensitive-word tree: the character it stands for, a flag
/// marking the end of a word, and its children keyed by character.
/// </summary>
sealed class WordNode : IDisposable
{
    /// <summary>Character represented by this node.</summary>
    public string word;

    /// <summary>1 when a sensitive word ends at this node, otherwise 0.</summary>
    public int endTag;

    /// <summary>Child nodes keyed by their character.</summary>
    public Dictionary<string, WordNode> wordNodes = new Dictionary<string, WordNode>();

    /// <summary>Creates a node for <paramref name="word"/> with no children.</summary>
    public WordNode(string word)
    {
        Reset(word);
    }

    /// <summary>Clears the node by resetting it with an empty character.</summary>
    public void Dispose()
    {
        Reset(string.Empty);
    }

    /// <summary>
    /// Stores <paramref name="word"/>, clears the end tag and removes all children.
    /// </summary>
    public void Reset(string word)
    {
        wordNodes.Clear();
        endTag = 0;
        this.word = word;
    }
}

public class WordFilter
{

// Every sensitive word currently loaded into the filter.
private List<string> allSensitiveWords = new List<string>();
// Root of the sensitive-word tree.
private WordNode rootWordNode = null;
// True once InitSensitiveWords has completed.
private bool isInit = false;

/// <summary>
/// Creates an uninitialised filter with an empty word list and a fresh root node.
/// </summary>
private WordFilter()
{
    isInit = false;
    allSensitiveWords.Clear();
    rootWordNode = new WordNode(null);
}

/// <summary>
/// Loads the sensitive words from a comma-separated string (empty entries are
/// dropped), rebuilds the word tree and marks the filter as initialised.
/// </summary>
public void InitSensitiveWords(string words)
{
    char[] separators = new char[] { ',' };
    string[] parts = words.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    this.allSensitiveWords.Clear();
    this.allSensitiveWords.AddRange(parts);

    BuildWordTree();
    this.isInit = true;
}

/// <summary>
/// Returns a copy of <paramref name="content"/> with every sensitive word
/// replaced by one '*' per character.
/// </summary>
/// <remarks>
/// At each position the word tree is walked from the root; the first node
/// with an end tag stops the walk, so the shortest word starting there wins.
/// When nothing matches, the character is copied unchanged and scanning
/// resumes at the next one. A partial match still pending at the end of the
/// text is treated like any other failed match, so a shorter sensitive word
/// inside it is still found.
/// When the filter is not initialised, or the text is null or empty, the
/// input is returned unchanged.
/// </remarks>
public string FilterSensitiveWords(string content)
{
    if (!isInit || null == rootWordNode)
    {
        Console.WriteLine("the sensitive word is not init");
        return content;
    }

    if (string.IsNullOrEmpty(content))
    {
        return content;
    }

    StringBuilder buffer = new StringBuilder(content.Length);
    int start = 0;

    while (start < content.Length)
    {
        WordNode node = this.rootWordNode;
        int matchedLength = 0;

        for (int pos = start; pos < content.Length; ++pos)
        {
            node = FindNode(node, content.Substring(pos, 1));
            if (null == node)
            {
                break;
            }

            if (node.endTag == 1)
            {
                matchedLength = pos - start + 1;
                break;
            }
        }

        if (matchedLength > 0)
        {
            buffer.Append('*', matchedLength);
            start += matchedLength;
        }
        else
        {
            buffer.Append(content[start]);
            ++start;
        }
    }

    return buffer.ToString();
}

/// <summary>
/// Reports whether <paramref name="content"/> contains at least one sensitive word.
/// </summary>
/// <remarks>
/// Every position of the text is tried as a possible word start, so a word
/// that begins inside a longer, unfinished match at the end of the text is
/// still detected. Returns false when the filter is not initialised or the
/// text is null or empty.
/// </remarks>
public bool IsHasSensitiveWord(string content)
{
    if (!isInit || null == rootWordNode)
    {
        Console.WriteLine("the sensitive word is not init");
        return false;
    }

    if (string.IsNullOrEmpty(content))
    {
        return false;
    }

    for (int start = 0; start < content.Length; ++start)
    {
        WordNode node = this.rootWordNode;

        for (int pos = start; pos < content.Length; ++pos)
        {
            node = FindNode(node, content.Substring(pos, 1));
            if (null == node)
            {
                break;
            }

            if (node.endTag == 1)
            {
                return true;
            }
        }
    }

    return false;
}

/// <summary>
/// Rebuilds the word tree: resets (or creates) the root node and inserts every
/// non-empty word from the sensitive-word list.
/// </summary>
private void BuildWordTree()
{
    if (this.rootWordNode == null)
    {
        this.rootWordNode = new WordNode("R");
    }
    this.rootWordNode.Reset("R");

    foreach (string sensitiveWord in this.allSensitiveWords)
    {
        if (sensitiveWord.Length == 0)
        {
            continue;
        }
        InsertNode(this.rootWordNode, sensitiveWord);
    }
}