这里使用 Python 的 bottle 框架实现一个简易的敏感词过滤系统,算法采用成熟的 DFA 关键词匹配算法。本系统只提供一套基于 HTTP 的 API,可供各个应用调用。这里我只实现了最核心的业务,其他部分以后再完善。
检索算法采用的是网上找到的 DFA 算法的 Python 实现版本:
smallgfw.py
#encoding=utf-8
#DFA based text filter
#version=0.3
class GFW(object):
    """Case-insensitive keyword filter backed by a character trie.

    ``self.d`` is the trie root: a dict mapping a lower-cased character to
    the child node (another dict).  A node that completes a keyword holds
    the extra key ``chr(11)``.  Matching is greedy from left to right,
    reports the shortest keyword found at each start position, and never
    reports overlapping hits.

    Keywords are expected not to contain the ``chr(11)`` character itself,
    since it is reserved as the end-of-keyword marker.
    """

    # Key stored in a trie node to mark "a keyword ends here".
    _END = chr(11)

    def __init__(self):
        self.d = {}

    def set(self, keywords):
        """Add every word in *keywords* to the filter.

        Words are stored lower-cased, character by character.  Empty words
        are ignored.  Calling ``set`` again adds to the existing trie.
        """
        end = self._END
        for word in keywords:
            if not word:
                # An empty keyword can never match anything; skip it rather
                # than planting an end marker on the root node.
                continue
            node = self.d
            for char in word:
                char = char.lower()
                child = node.get(char)
                if not isinstance(child, dict):
                    # Missing child, or an end-marker placeholder that a
                    # longer keyword now has to pass through.
                    child = node[char] = {}
                node = child
            node[end] = ''

    def _scan(self, text):
        """Yield ``(start, length)`` for each keyword hit in *text*.

        Hits are produced left to right and do not overlap: after a hit the
        scan resumes right behind it; after a failed attempt it resumes one
        character after the attempt's start.
        """
        root = self.d
        end = self._END
        node = root
        start = 0    # where the current match attempt begins
        length = 0   # characters matched so far in this attempt
        total = len(text)
        while start + length < total:
            char = text[start + length].lower()
            if char not in node:
                # Dead end: retry from the next start position.
                start += 1
                length = 0
                node = root
                continue
            node = node[char]
            length += 1
            if end in node:
                yield start, length
                start += length
                length = 0
                node = root

    def replace(self, text, mask):
        """Return *text* with every keyword occurrence replaced by *mask*.

        >>> gfw = GFW()
        >>> gfw.set(["sexy","girl","love","shit"])
        >>> gfw.replace("Shit!,Cherry is a sexy girl. She loves python.","*")
        '*!,Cherry is a * *. She *s python.'
        """
        pieces = []
        tail = 0  # index of the first character not yet copied to pieces
        for start, length in self._scan(text):
            pieces.append(text[tail:start])
            pieces.append(mask)
            tail = start + length
        pieces.append(text[tail:])
        return "".join(pieces)

    def check(self, text):
        """Return a list of ``(start, length, matched_text)`` for each hit.

        >>> gfw = GFW()
        >>> gfw.set(["abd","defz","bcz"])
        >>> gfw.check("xabdabczabdxaadefz")
        [(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')]
        """
        return [(start, length, text[start:start + length])
                for start, length in self._scan(text)]
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    import doctest
    import sys

    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
#-*- coding:utf-8 -*-
#localhost testing
#caroltc 2014/10/7
from bottle import route, run, request
from smallgfw import *
import json
import sys
def initWords():
    """Load the sensitive-word list from ``words.txt`` in the working directory.

    Returns:
        list: one keyword per non-empty line, in file order, with the line
        terminator removed.
    """
    path = 'words.txt'
    word_list = []
    # ``with`` closes the file even if reading raises.
    with open(path, 'r') as fp:
        for line in fp:
            # Strip only the line terminator. Slicing off the last character
            # would truncate a final line that has no trailing newline and
            # would leave a stray '\r' on CRLF files.
            word = line.rstrip('\r\n')
            if word:
                word_list.append(word)
    return word_list
@route('/replace', method="POST")
def replace():
    """Mask sensitive words in the POSTed ``words`` field.

    Reads the ``words`` request parameter (falling back to "" when it is
    empty or missing), builds a GFW filter from the list returned by
    initWords(), and returns the UTF-8 encoded text with every hit replaced
    by "**".
    """
    # Python 2 only: switch the process-wide default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    getwords = request.params.words or ""
    gfw = GFW()
    words = initWords()
    gfw.set(words)  # load the sensitive-word list
    # Only the masked text is returned, so the text is scanned once here;
    # a separate check() pass would be discarded work.
    s = gfw.replace(getwords.encode('utf8'), "**")
    return s
@route('/check', method="POST")
def check():
    """Report the sensitive words found in the POSTed ``words`` field.

    Returns a JSON object with ``count`` (number of hits) and ``datas``
    (the hit list produced by GFW.check on the UTF-8 encoded text).
    """
    # Python 2 only: switch the process-wide default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    submitted = request.params.words or ""
    detector = GFW()
    detector.set(initWords())  # load the sensitive-word list
    hits = detector.check(submitted.encode('utf8'))
    return json.dumps({'count': len(hits), 'datas': hits})
@route('/test')
def test():
    """Serve a minimal HTML form that POSTs a ``words`` field to /replace."""
    # Python 2 only: switch the process-wide default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # The form is closed with </form>; the former "</from>" left it unclosed.
    webdata = '<h1>check</h1><form action="/replace" method="post"><input type="text" name="words" /><input type="submit"></form>'
    return webdata
# Start bottle's server on localhost:80 with debug mode switched on.
# NOTE(review): binding port 80 usually needs elevated privileges, and
# debug=True is presumably meant for local testing only — confirm before deploying.
run(host='localhost', port=80, debug=True)
测试一下 API,均为 POST 请求:
过滤敏感词 API(POST /replace,参数 words):直接返回过滤后的文本,敏感词被替换为 "**"。
检测敏感词 API(POST /check,参数 words):返回 JSON 格式数据,其中 count 为命中数量,datas 为命中列表(起始位置、长度、命中的词)。
用 bottle 来开发这样的小工具相当快。敏感词检测在很多应用场景中都需要,把它独立出来写成接口可以提高效率,并且易于维护。国内目前第三方敏感词检测服务还不多,天朝的需求又很旺盛,可以试试搞个在线敏感词检测服务平台。