原创文章,转载请加出处 https://blog.youkuaiyun.com/gkbxs/article/details/112667004
知识处理库部分介绍:本模块有两个功能,一是按相似度对已有知识条目去重;二是收集新增字符串,若新字符串与已有条目的相似度过高则不再收录,从而保证知识库的精简。
def mycollector(a, blist, set=0.9):
    """Append ``a`` to ``blist`` unless it is too similar to an existing entry.

    Similarity is ``difflib.SequenceMatcher(None, a, entry).ratio()`` computed
    against every entry of ``blist``. If the highest ratio is greater than or
    equal to ``set`` the candidate is treated as a duplicate and discarded;
    otherwise it is appended.

    Args:
        a: Candidate string to collect.
        blist: List of already collected strings. It is modified in place.
        set: Similarity threshold in ``[0, 1]``. The name shadows the builtin
            ``set`` but is kept because callers pass it by keyword.

    Returns:
        The same ``blist`` object that was passed in (possibly extended).
    """
    # Imported locally so the function does not rely on a module-level import.
    import difflib

    scores = [
        difflib.SequenceMatcher(None, a, known).ratio() for known in blist
    ]
    # ``default=None`` keeps an empty ``blist`` from raising ValueError in max().
    best = max(scores, default=None)
    print(scores)
    print('最大值', best)

    if best is not None and best >= set:
        print('这个输入重复了哦 ')
    else:
        # Nothing to compare against, or nothing similar enough: collect it.
        blist.append(a)
    return blist
# Sample strings used by the demo under ``if __name__ == "__main__"``.
testlist = [
    '今天天气很好',
    '明天去吃大餐',
    '天气正好好',
    '白日依山尽',
    '黄河如海流',
    '测试代码666',
    '明天去吃大餐!',
    '天气 正好好',
    '这是一次删除测试',
]
def quchong(mylist, set=0.9):
    """Remove near-duplicate strings from ``mylist`` in place.

    An item is removed when any item located *after* it in the list has a
    ``difflib.SequenceMatcher(None, item, later).ratio()`` greater than or
    equal to ``set``. Of two similar items, the later one is therefore the one
    that survives. Comparisons are made against the list as it was on entry,
    not against the partially pruned list.

    Args:
        mylist: List of strings to deduplicate. It is modified in place.
        set: Similarity threshold in ``[0, 1]``. The name shadows the builtin
            ``set`` but is kept because callers may pass it by keyword.

    Returns:
        The same ``mylist`` object that was passed in, after pruning.
    """
    # Imported locally so the function does not rely on a module-level import.
    import difflib

    # Positions come from enumerate(), not list.index(), so exact duplicates
    # map to their own position and each position is recorded at most once.
    doomed = []
    for pos, item in enumerate(mylist):
        later_items = mylist[pos + 1:]
        if any(
            difflib.SequenceMatcher(None, item, other).ratio() >= set
            for other in later_items
        ):
            doomed.append(pos)

    print('需要删除相似度很高的index是', doomed)

    # Delete from the highest index down so earlier deletions never shift the
    # positions that are still pending.
    for pos in reversed(doomed):
        del mylist[pos]

    print(mylist)
    return mylist
if __name__ == "__main__":
    # Demo 1: deduplicate the sample list and show what quchong() returns.
    deduped = quchong(testlist)
    print('测试返回', deduped)

    # Demo 2: offer a new string to the collector with a looser threshold.
    # The same ``testlist`` object that was handed to quchong() is passed here.
    candidate = '黄河如 海流!'
    collected = mycollector(candidate, testlist, set=0.7)
    print(collected)