Short-Text Deduplication Based on SimHash

This post describes a method for detecting near-duplicate text with the SimHash algorithm: each document is tokenized, stop words are removed, and a SimHash fingerprint is computed, enabling efficient deduplication over large text collections. The code example shows how to build a SimHash index and look up duplicate text fragments.
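
Before the full script, here is a minimal sketch of the core idea, assuming the simhash package is installed (pip install simhash). SimHash maps a text to a 64-bit fingerprint; near-duplicate texts produce fingerprints with a small Hamming distance, and that distance is what the index later thresholds on. The two token strings below are hypothetical examples:

from simhash import Simhash

# Two near-identical, space-separated token strings (hypothetical examples).
a = Simhash('设备 采购 公告 二次')
b = Simhash('设备 采购 公告')

# distance() returns the Hamming distance between the two 64-bit fingerprints;
# a small value indicates near-duplicates.
print(a.distance(b))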

Straight to the code:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
利用simhash进行文本去重
"""

from simhash import Simhash, SimhashIndex
import jieba
import codecs
import datetime
import os


class Dedup(object):
    def __init__(self, data_dir='../data/', model_dir='../model/', file_name='test_data.txt',
                 clean_file_name='test_data_clean.txt'):
        self.data_dir = data_dir
        self.model_dir = model_dir
        self.file_name = file_name
        self.clean_file_name = clean_file_name
        # Load the stop-word list once; re-reading the file for every word is needlessly slow.
        self.stopwords = set(self.stop_word_list(self.data_dir + 'stopwords.txt'))

    def stop_word_list(self, stop_words_path):
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            return [x.strip() for x in f]

    def tokenization(self, line):
        """
        :param line: one line of raw data
        :return: tokens after jieba segmentation and stop-word removal
        """
        return [word for word in jieba.lcut(line) if word not in self.stopwords]

    def read_data(self, file):
        data_list = []
        with open(self.data_dir + file, encoding='utf-8') as data:
            for line in data:
                data_list.append(line.rstrip('\n'))
        return data_list

    def get_data_dict(self):
        data_dic = {}
        clean_data = []
        if not os.path.exists(self.data_dir + self.clean_file_name):
            # First run: build the cleaned file from the raw data.
            with open(self.data_dir + self.clean_file_name, 'w', encoding='utf-8') as cleaned_data:
                for sent in self.read_data(self.file_name):
                    clean_line = self.tokenization(sent)
                    clean_data.append(clean_line)
                    cleaned_data.write(' '.join(clean_line) + '\n')
        else:
            # The cleaned file stores space-separated tokens, one document per line.
            clean_data = [line.split() for line in self.read_data(self.clean_file_name)]
        # Key each document by its 1-based line number so index hits map back to the raw file.
        for index, line in enumerate(clean_data, start=1):
            data_dic[str(index)] = ' '.join(line)
        return data_dic

    def get_index(self):
        data_dic = self.get_data_dict()
        # k is the Hamming-distance tolerance: fingerprints within k bits count as near-duplicates.
        line_score = [(line_id, Simhash(sent)) for line_id, sent in data_dic.items()]
        return SimhashIndex(line_score, k=2)


if __name__ == '__main__':
    start_time = datetime.datetime.now()
    find_dup = Dedup()
    sim_hash_index = find_dup.get_index()
    inp = '全椒县经开区污水处理厂有限公司提标改造设备采购二次'
    inp_sim_hash = Simhash(' '.join(find_dup.tokenization(inp)))

    result_index = sim_hash_index.get_near_dups(inp_sim_hash)
    if len(result_index):
        print('duplicate line index\t', result_index[0])
        # read_data() already prepends data_dir, so pass only the file name.
        raw_data_list = find_dup.read_data(find_dup.file_name)
        print('duplicate title\t', raw_data_list[int(result_index[0]) - 1])
    else:
        print('no duplicate lines found')

    end_time = datetime.datetime.now()
    print('elapsed time: %f minutes.' % ((end_time - start_time).total_seconds() / 60))



