Straight to the code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
利用simhash进行文本去重
"""
from simhash import Simhash, SimhashIndex
import jieba
import codecs
import datetime
import os

class Dudup(object):

    def __init__(self, data_dir='../data/', model_dir='../model/', file_name='test_data.txt',
                 clean_file_name='test_data_clean.txt'):
        self.data_dir = data_dir
        self.model_dir = model_dir
        self.file_name = file_name
        self.clean_file_name = clean_file_name

    def stop_word_list(self, stop_words_path):
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            stopwords = [x.strip() for x in f.readlines()]
        return stopwords

    def tokenization(self, line):
        """
        :param line: a raw line of text
        :return: tokens with stop words removed
        """
        result = []
        # Load the stop word list once per line instead of once per word.
        stopwords = set(self.stop_word_list(self.data_dir + 'stopwords.txt'))
        words = jieba.lcut(line)
        for word in words:
            if word not in stopwords:
                result.append(word)
        return result

    def read_data(self, file):
        data_list = []
        with open(self.data_dir + file, encoding='utf-8') as data:
            for line in data.readlines():
                data_list.append(line.rstrip('\n'))
        return data_list

    def get_data_dict(self):
        """Build a {line_id: space-joined tokens} dict for the whole corpus."""
        data_dic = {}
        if not os.path.exists(self.data_dir + self.clean_file_name):
            # First run: tokenize the raw file and cache the cleaned version.
            clean_data = []
            with open(self.data_dir + self.clean_file_name, 'w', encoding='utf-8') as cleaned_data:
                for sent in self.read_data(self.file_name):
                    clean_line = self.tokenization(sent)
                    clean_data.append(clean_line)
                    cleaned_data.write(' '.join(clean_line) + '\n')
        else:
            # Later runs: the cached file holds space-separated tokens, one sentence per line.
            clean_data = [line.split() for line in self.read_data(self.clean_file_name)]
        for index, line in enumerate(clean_data, start=1):
            data_dic[str(index)] = ' '.join(line)
        return data_dic

    def get_index(self):
        data_dic = self.get_data_dict()
        print(data_dic)  # print the id -> sentence dictionary
        line_score = [(line_id, Simhash(sent)) for line_id, sent in data_dic.items()]
        index = SimhashIndex(line_score, k=2)  # k is the max Hamming distance for a near-duplicate
        return index

if __name__ == '__main__':
    start_time = datetime.datetime.now()
    find_dup = Dudup()
    Sim_Hash_Index = find_dup.get_index()
    inp = '“全椒县经开区污水处理厂有限公司提标改造设备采购二次'
    inp_sim_hash = Simhash(' '.join(find_dup.tokenization(inp)))
    result_index = Sim_Hash_Index.get_near_dups(inp_sim_hash)
    if len(result_index):
        print('duplicate line id\t', result_index[0])
        # read_data() already prepends data_dir, so pass only the file name.
        raw_data_list = find_dup.read_data(find_dup.file_name)
        print('duplicate title\t', raw_data_list[int(result_index[0]) - 1])
    else:
        print('no duplicate lines found')
    end_time = datetime.datetime.now()
    print("consume time is %f minutes." % ((end_time - start_time).total_seconds() / 60))