敏感字符过滤

本文介绍了一种基于树形结构的敏感词过滤算法，并提供了 C++ 和 C# 两种语言的具体实现。通过构建敏感词树，可以高效地检测并替换文本中的敏感词汇。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__

#include <map>

// One node of the sensitive-word tree.
//
// A node carries a label string (m_cWord), a flag telling whether a sensitive
// word ends at this node (m_nEndTag, 1 = yes), and a map from label to child
// node. A node owns the children stored in m_mapWordNodes: they are deleted
// by the destructor and by Reset().
class CWordNode
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    // Creates a childless node labelled `word` with the end flag cleared.
    CWordNode(const std::string& word) : m_cWord(word), m_nEndTag(0) {}

    // Destroys the node together with its whole subtree.
    ~CWordNode()
    {
        DeleteChildren();
        this->m_nEndTag = 0;
    }

    // Re-labels the node as `word`, clears the end flag and destroys every
    // child. The previous implementation only cleared the map, which leaked
    // the child nodes.
    void Reset(const std::string& word)
    {
        // Copy the label first: `word` may refer to a string owned by one of
        // the children that are about to be deleted.
        this->m_cWord   = word;
        this->m_nEndTag = 0;
        DeleteChildren();
    }

public:
    std::string m_cWord;        // label of this node
    int         m_nEndTag;      // 1 when a sensitive word ends at this node
    umap        m_mapWordNodes; // owned children, keyed by their label

private:
    // Copying would make two nodes delete the same children, so it is
    // disabled (declared but intentionally not defined).
    CWordNode(const CWordNode&);
    CWordNode& operator=(const CWordNode&);

    // Deletes every owned child and empties the map.
    void DeleteChildren()
    {
        for (umap::iterator it = this->m_mapWordNodes.begin();
             it != this->m_mapWordNodes.end(); ++it)
        {
            delete it->second;
        }
        this->m_mapWordNodes.clear();
    }
};


#endif // __TOOLS_WORDNODE_H_INCLUDE__

WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__

#include <list>
#include "WordNode.h"

// Sensitive-word filter: keeps a list of sensitive words, builds a tree of
// CWordNode objects from it, and filters text against that tree.
class CWordsFilter
{
// Map type used when looking up the children of a CWordNode.
typedef std::map<std::string, CWordNode*> umap;
private:
std::list<std::string> m_lsAllSensitiveWords; // list of all sensitive words
// Root of the word tree; NULL until BuildWordTree() has created it.
CWordNode*             m_rootWordNode;
// Set to true once InitSensitiveWords() has finished.
bool                   m_bIsInit;

public:
CWordsFilter();
~CWordsFilter();

static CWordsFilter&  GetInstance(); // get the shared instance

// The std::string overload takes a comma-separated word list; the std::list
// overload takes the words directly.
void                   InitSensitiveWords(std::string strWord);                        // initialize the sensitive-word set
void                   InitSensitiveWords(std::list<std::string> lsAllSensitiveWords); // initialize the sensitive-word set
std::string            FilterSensitiveWords(const std::string& strContent);      // filter sensitive words

private:
void                   BuildWordTree(); // build the sensitive-word tree
// Inserts the characters of strContent below pNode.
void                   InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);
// Returns the child of pNode stored under `word`, or NULL when there is none.
CWordNode*             FindNode(CWordNode* pNode, const std::string& word);
int                    GetFirstBytes(const std::string& str); // byte length of the first character in the string
};

#endif // __TOOLS_WORDSFILTER_H_INCLUDE__

WordsFilter.cpp
#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "WordsFilter.h"

// Scratch variable for the byte length of a character (initial value 2).
// NOTE(review): this is a mutable global with external linkage; a
// function-local variable would be safer.
int nStep = 2;

typedef std::vector<std::string> Tokens;

// Splits `src` into tokens.
//
// Every character contained in `sep` acts as a delimiter. Runs of delimiters
// are collapsed, so the result never contains empty tokens. An empty `sep`
// yields `src` as a single token (or nothing when `src` is empty).
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens pieces;

    // Jump from the start of one token to the start of the next.
    std::string::size_type first = src.find_first_not_of(sep);
    while (first != std::string::npos)
    {
        const std::string::size_type last = src.find_first_of(sep, first);
        if (last == std::string::npos)
        {
            // Final token runs to the end of the input.
            pieces.push_back(src.substr(first));
            break;
        }
        pieces.push_back(src.substr(first, last - first));
        first = src.find_first_not_of(sep, last);
    }

    return pieces;
}

// Returns the number of bytes occupied by the first UTF-8 character of `str`.
//
// Only the first byte is inspected. The previous implementation skipped over
// unrecognised bytes and reported the length of a *later* character, which
// made callers cut the text at the wrong position.
//
// The result is always in [1, str.size()] for a non-empty string:
//  - a byte that is not a valid lead byte (continuation byte, 0xFE, 0xFF)
//    counts as a one-byte character;
//  - a sequence truncated by the end of the string is shortened to the bytes
//    that are actually present.
// An empty string yields 1, as before.
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1;
    }

    const unsigned char chLead = static_cast<unsigned char>(str[0]);

    int nLength = 1; // 0xxx xxxx (ASCII) and every invalid lead byte
    if ((chLead >> 5) == 0x06)      // 110x xxxx
    {
        nLength = 2;
    }
    else if ((chLead >> 4) == 0x0E) // 1110 xxxx
    {
        nLength = 3;
    }
    else if ((chLead >> 3) == 0x1E) // 1111 0xxx
    {
        nLength = 4;
    }
    else if ((chLead >> 2) == 0x3E) // 1111 10xx (obsolete 5-byte form, kept for compatibility)
    {
        nLength = 5;
    }
    else if ((chLead >> 1) == 0x7E) // 1111 110x (obsolete 6-byte form, kept for compatibility)
    {
        nLength = 6;
    }

    // Never report more bytes than the string holds.
    if (static_cast<std::string::size_type>(nLength) > str.size())
    {
        nLength = static_cast<int>(str.size());
    }
    return nLength;
}

// Creates an uninitialised filter: empty word list, no tree, not ready for
// filtering until InitSensitiveWords() is called.
CWordsFilter::CWordsFilter()
    : m_lsAllSensitiveWords()
    , m_rootWordNode(NULL)
    , m_bIsInit(false)
{
}

// Releases the word tree (deleting the root deletes its subtree through the
// CWordNode destructor) and empties the word list.
CWordsFilter::~CWordsFilter()
{
    if (m_rootWordNode != NULL)
    {
        delete m_rootWordNode;
        m_rootWordNode = NULL;
    }
    m_lsAllSensitiveWords.clear();
}

void CWordsFilter::InitSensitiveWords(std::string strWord)
{
Tokens token = StrSplit(strWord, ",");
std::list<std::string> lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{
lsAllSensitiveWords.push_back(*Ite);
++Ite;
}
InitSensitiveWords(lsAllSensitiveWords);
}

void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{
std::cout << "start init sensitive words" << std::endl;
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;

BuildWordTree();
this->m_bIsInit = true;
}

// Returns a copy of `strContent` in which every sensitive word is replaced by
// asterisks, one '*' per character (not per byte) of the word.
//
// Matching is attempted at each character position in turn. At a position the
// shortest sensitive word starting there wins; if none starts there, the
// character is copied to the output unchanged and matching continues with the
// next character.
//
// Returns an empty string (after logging to stdout) when the filter has not
// been initialised.
//
// Fixes over the previous implementation:
//  - no std::out_of_range from substr() when a failed partial match is rolled
//    back onto a character longer than the remaining text;
//  - a partial match that reaches the end of the text is rolled back like any
//    other failed match, so a sensitive word starting inside it is still
//    found (it used to be copied to the output unfiltered);
//  - a truncated multi-byte character at the end of the text is copied
//    through instead of being dropped;
//  - the remaining text is no longer copied for every character, and the
//    global scratch variable is no longer used.
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    // Longest byte sequence GetFirstBytes() can report for one character;
    // a window of this size is all it needs to look at.
    const std::string::size_type kMaxCharBytes = 6;
    const std::string::size_type nTotal = strContent.size();

    std::string strBuffer;
    strBuffer.reserve(nTotal);

    std::string::size_type nStart = 0;
    while (nStart < nTotal)
    {
        // Walk the tree from the root, consuming characters from nStart on.
        CWordNode* pNode = this->m_rootWordNode;
        std::string::size_type nCursor = nStart;
        std::string::size_type nFirstCharBytes = 1;
        std::string::size_type nMatchedChars = 0;
        bool bMatched = false;

        while (nCursor < nTotal)
        {
            const std::string strWindow = strContent.substr(nCursor, kMaxCharBytes);

            // Clamp the reported length to [1, bytes available] so that a
            // malformed or truncated sequence can never run past the text.
            std::string::size_type nCharBytes = 1;
            const int nReported = GetFirstBytes(strWindow);
            if (nReported > 1)
            {
                nCharBytes = static_cast<std::string::size_type>(nReported);
            }
            if (nCharBytes > strWindow.size())
            {
                nCharBytes = strWindow.size();
            }
            if (nCursor == nStart)
            {
                nFirstCharBytes = nCharBytes;
            }

            pNode = FindNode(pNode, strWindow.substr(0, nCharBytes));
            if (NULL == pNode)
            {
                break;
            }

            nCursor += nCharBytes;
            ++nMatchedChars;
            if (pNode->m_nEndTag == 1)
            {
                bMatched = true;
                break;
            }
        }

        if (bMatched)
        {
            strBuffer.append(nMatchedChars, '*');
            nStart = nCursor;
        }
        else
        {
            // No word starts here: emit the first character and retry from
            // the character that follows it.
            strBuffer.append(strContent, nStart, nFirstCharBytes);
            nStart += nFirstCharBytes;
        }
    }

    return strBuffer;
}

// (Re)builds the word tree from m_lsAllSensitiveWords.
//
// Any existing tree is deleted first and a fresh root labelled "R" is
// created. The previous implementation reused the root via Reset(), which
// only cleared the child map and therefore leaked the old nodes on every
// re-initialisation. The NULL check after `new` was dropped: a failed `new`
// throws std::bad_alloc instead of returning NULL.
void CWordsFilter::BuildWordTree()
{
    delete this->m_rootWordNode;
    this->m_rootWordNode = NULL; // stay consistent if the allocation below throws
    this->m_rootWordNode = new CWordNode("R");

    for (std::list<std::string>::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
         cIte != this->m_lsAllSensitiveWords.end(); ++cIte)
    {
        if (!cIte->empty())
        {
            InsertNode(this->m_rootWordNode, *cIte, 0);
        }
    }
}

// Inserts the word `strContent` into the tree below `pNode`, one character
// per tree level, and sets the end flag on the node of the last character.
//
// pNode      - node to insert under; nothing happens when it is NULL.
// strContent - the word, as UTF-8 bytes; an empty word inserts nothing.
// nIndex     - byte offset in strContent at which insertion starts; negative
//              values are treated as 0. Every caller in this file passes 0.
//
// Fixes over the previous implementation:
//  - the character was extracted with substr() but the result was thrown
//    away, so every node was stored under the empty key and no word could
//    ever be matched;
//  - substr() could throw std::out_of_range for an empty word or a word
//    ending in a truncated multi-byte character;
//  - the global scratch variable is no longer used, and the recursion is
//    replaced by a loop.
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode)
    {
        return;
    }

    // Longest byte sequence GetFirstBytes() can report for one character.
    const std::string::size_type kMaxCharBytes = 6;
    const std::string::size_type nTotal = strContent.size();

    std::string::size_type nPos = 0;
    if (nIndex > 0)
    {
        nPos = static_cast<std::string::size_type>(nIndex);
    }

    CWordNode* pCurrent = pNode;
    while (nPos < nTotal)
    {
        const std::string strWindow = strContent.substr(nPos, kMaxCharBytes);

        // Clamp the reported length to [1, bytes available].
        std::string::size_type nCharBytes = 1;
        const int nReported = GetFirstBytes(strWindow);
        if (nReported > 1)
        {
            nCharBytes = static_cast<std::string::size_type>(nReported);
        }
        if (nCharBytes > strWindow.size())
        {
            nCharBytes = strWindow.size();
        }

        const std::string strChar = strWindow.substr(0, nCharBytes);

        CWordNode* pChild = FindNode(pCurrent, strChar);
        if (NULL == pChild)
        {
            pChild = new CWordNode(strChar);
            pCurrent->m_mapWordNodes[strChar] = pChild;
        }

        nPos += nCharBytes;
        if (nPos >= nTotal)
        {
            pChild->m_nEndTag = 1; // last character of the word
        }
        pCurrent = pChild;
    }
}

// Looks up the direct child of `pNode` stored under the key `word`.
// Returns that child, or NULL when `pNode` is NULL or has no such child.
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    CWordNode* pFound = NULL;
    if (pNode != NULL)
    {
        const umap::const_iterator itChild = pNode->m_mapWordNodes.find(word);
        if (itChild != pNode->m_mapWordNodes.end())
        {
            pFound = itChild->second;
        }
    }
    return pFound;
}

// Returns the shared filter instance, a function-local static that is
// constructed the first time this function is called.
CWordsFilter& CWordsFilter::GetInstance()
{
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}

以上是 C++ 版本的实现。

-------------------------------------------------------------分割线---------------------------------------------------------------------

下面是 C# 版本的实现。

/// <summary>
/// One node of the sensitive-word tree: a label, a flag telling whether a
/// sensitive word ends at this node (1 = yes), and the child nodes keyed by
/// their label.
/// </summary>
sealed class WordNode : IDisposable
{
    /// <summary>Label of this node.</summary>
    public string word;

    /// <summary>1 when a sensitive word ends at this node, otherwise 0.</summary>
    public int endTag;

    /// <summary>Child nodes keyed by their label.</summary>
    public Dictionary<string, WordNode> wordNodes = new Dictionary<string, WordNode>();

    /// <summary>Creates a childless node labelled <paramref name="word"/>.</summary>
    public WordNode(string word)
    {
        Reset(word);
    }

    /// <summary>Resets the node to an empty label with no children.</summary>
    public void Dispose()
    {
        Reset(string.Empty);
    }

    /// <summary>
    /// Re-labels the node, clears the end flag and drops all children.
    /// </summary>
    public void Reset(string word)
    {
        wordNodes.Clear();
        endTag = 0;
        this.word = word;
    }
}

/// <summary>
/// Sensitive-word filter backed by a tree of <see cref="WordNode"/> objects
/// (one UTF-16 code unit per tree level). Obtain the shared object through
/// <see cref="Instance"/> and call <see cref="InitSensitiveWords"/> before
/// filtering.
/// </summary>
public class WordFilter
{
    private List<string> allSensitiveWords = new List<string>();
    private WordNode rootWordNode = null;
    private bool isInit = false;

    private WordFilter()
    {
        rootWordNode = new WordNode(null);
    }

    /// <summary>
    /// Replaces the set of sensitive words and rebuilds the word tree.
    /// </summary>
    /// <param name="words">Comma-separated sensitive words; empty entries are ignored.</param>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    public void InitSensitiveWords(string words)
    {
        if (null == words)
        {
            throw new ArgumentNullException("words");
        }

        string[] wordArr = words.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        this.allSensitiveWords.Clear();
        this.allSensitiveWords.AddRange(wordArr);
        BuildWordTree();
        this.isInit = true;
    }

    /// <summary>
    /// Returns <paramref name="content"/> with every sensitive word replaced
    /// by asterisks, one '*' per character of the word.
    /// </summary>
    /// <remarks>
    /// Matching is attempted at each position in turn; the shortest sensitive
    /// word starting at a position wins. When the filter is not initialised,
    /// or the input is null or empty, the input is returned unchanged.
    /// Unlike the previous implementation, a partial match that reaches the
    /// end of the text is rolled back like any other failed match, so a
    /// sensitive word starting inside it is still replaced.
    /// </remarks>
    public string FilterSensitiveWords(string content)
    {
        if (!isInit || null == rootWordNode)
        {
            Console.WriteLine("the sensitive word is not init");
            return content;
        }
        if (string.IsNullOrEmpty(content))
        {
            return content;
        }

        StringBuilder buffer = new StringBuilder(content.Length);
        int pos = 0;
        while (pos < content.Length)
        {
            int matched = MatchLength(content, pos);
            if (matched > 0)
            {
                buffer.Append('*', matched);
                pos += matched;
            }
            else
            {
                // No word starts here: keep the character and move on.
                buffer.Append(content[pos]);
                ++pos;
            }
        }

        return buffer.ToString();
    }

    /// <summary>
    /// Tells whether <paramref name="content"/> contains at least one
    /// sensitive word.
    /// </summary>
    /// <remarks>
    /// Returns false when the filter is not initialised or the input is null
    /// or empty. Unlike the previous implementation, a word that starts inside
    /// a partial match reaching the end of the text is still detected.
    /// </remarks>
    public bool IsHasSensitiveWord(string content)
    {
        if (!isInit || null == rootWordNode)
        {
            Console.WriteLine("the sensitive word is not init");
            return false;
        }
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        for (int pos = 0; pos < content.Length; ++pos)
        {
            if (MatchLength(content, pos) > 0)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns the length of the shortest sensitive word that starts exactly
    /// at <paramref name="start"/>, or 0 when no sensitive word starts there.
    /// </summary>
    private int MatchLength(string content, int start)
    {
        WordNode node = this.rootWordNode;
        for (int index = start; index < content.Length; ++index)
        {
            node = FindNode(node, content.Substring(index, 1));
            if (null == node)
            {
                return 0;
            }
            if (node.endTag == 1)
            {
                return index - start + 1;
            }
        }

        // Ran out of text in the middle of a word: not a match.
        return 0;
    }

    /// <summary>Rebuilds the word tree from the current word list.</summary>
    private void BuildWordTree()
    {
        if (null == this.rootWordNode)
        {
            this.rootWordNode = new WordNode("R");
        }
        this.rootWordNode.Reset("R");

        foreach (string word in this.allSensitiveWords)
        {
            if (!string.IsNullOrEmpty(word))
            {
                InsertNode(this.rootWordNode, word);
            }
        }
    }

    /// <summary>
    /// Inserts <paramref name="content"/> below <paramref name="node"/>, one
    /// character per level, and flags the node of the last character as the
    /// end of a word. Null or empty content inserts nothing.
    /// </summary>
    private void InsertNode(WordNode node, string content)
    {
        if (null == node)
        {
            Console.WriteLine("the root node is null");
            return;
        }
        if (string.IsNullOrEmpty(content))
        {
            return;
        }

        WordNode current = node;
        for (int index = 0; index < content.Length; ++index)
        {
            string key = content.Substring(index, 1);
            WordNode child = FindNode(current, key);
            if (null == child)
            {
                child = new WordNode(key);
                current.wordNodes[key] = child;
            }
            current = child;
        }
        current.endTag = 1;
    }

    /// <summary>
    /// Returns the child of <paramref name="node"/> stored under
    /// <paramref name="content"/>, or null when there is none or
    /// <paramref name="node"/> is null.
    /// </summary>
    private WordNode FindNode(WordNode node, string content)
    {
        if (null == node)
        {
            return null;
        }

        WordNode wordNode = null;
        node.wordNodes.TryGetValue(content, out wordNode);
        return wordNode;
    }

    private static WordFilter instance = null;

    /// <summary>The shared filter object, created on first access.</summary>
    public static WordFilter Instance
    {
        get
        {
            if (null == instance)
            {
                instance = new WordFilter();
            }
            return instance;
        }
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值