关于敏感词过滤问题,这里先占个坑,后续再写一篇由浅入深的文章。
下面贴出的是用 Lua 实现的前缀树(Trie)法。注意:系统字符集需要设置为 UTF-8,Lua 脚本文件的编码也要设为 UTF-8,不然就会出现乱码。也有一种不区分编码的实现,即按字节构建前缀树,但这样打码所用的符号个数就可能与敏感词的实际字符数不一致了。
-- Module name of the word list. Kept in a variable because NG.reload uses it
-- to evict the module from package.loaded before requiring it again.
local path = "data.NGWords"
-- Word-list table. NG.parse iterates it with pairs and reads the `str` field
-- of each entry.
local db = require(path)
-- Module table holding the filter's functions and its state
-- (NG.root and NG.initFinished are assigned by NG.init / NG.reload).
local NG = {}
--- Build the trie for the first time and mark the module as initialised.
-- Stores the tree returned by NG.parse in NG.root, then sets
-- NG.initFinished to true. Progress is reported through print.
function NG.init()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"

    print(banner, "begin init")

    local tree = NG.parse()
    NG.root = tree
    NG.initFinished = true

    print(banner, "init finish")
end
--- Build the prefix tree (trie) from every entry of the word list.
-- Each entry of `db` with a non-empty `str` field is converted by
-- NG.toLowerCharArray; the resulting elements are inserted one per trie
-- level, and the node reached by the last element is flagged through
-- NG.setNodeIsEnd.
-- @treturn table the root node of the freshly built trie
function NG.parse()
    local root = {}
    for _, v in pairs(db) do
        if v.str and "" ~= v.str then
            local node = root
            local charArray = NG.toLowerCharArray(v.str)
            -- The trie depends on the characters being visited in order.
            -- pairs gives no ordering guarantee (not even for integer keys),
            -- so walk the array with ipairs.
            -- NOTE(review): assumes NG.toLowerCharArray returns a 1-based
            -- sequence without holes -- confirm against its definition.
            for _, c in ipairs(charArray) do
                local child = NG.getSubNode(node, c)
                if not child then
                    child = NG.createNode()
                    NG.addSubNode(node, c, child)
                end
                node = child
            end
            -- An empty character array leaves no node to flag; skip it
            -- instead of handing nil to NG.setNodeIsEnd.
            if node ~= root then
                NG.setNodeIsEnd(node)
            end
        end
    end
    return root
end
--- Reload the word list from disk and rebuild the trie.
-- Evicts the word-list module from package.loaded so that require loads it
-- again, clears NG.initFinished while the new tree is being built, then
-- stores the new tree in NG.root and sets NG.initFinished back to true.
-- Progress is reported through print.
function NG.reload()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"

    print(banner, "begin reload")

    -- Drop the cached module so the next require re-reads it.
    package.loaded[path] = nil
    db = require(path)

    -- Flag stays cleared for the duration of the rebuild.
    NG.initFinished = nil
    local tree = NG.parse()
    NG.root = tree
    NG.initFinished = true

    print(banner, "reload finish")
end
function NG.toLower(c)
local byte = string.byte(c, 1)
local charByteCount = NG.judgeByteCountByFirstUTF8Byte(byte)
if 1 == charByteCount and by