Text similarity notes

Using SentenceTransformer

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

device = "mps"  # Apple Silicon GPU; use "cuda" or "cpu" depending on your hardware

# Load new JSON data
with open('unique_.json', 'r') as f:
    new_data = json.load(f)

# Function to combine instruction and input
def combine_instruction_input(data):
    instructions = []
    for d in data:
        instruction = d['instruction']
        input_text = d['input']
        if input_text != '':
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Extract instructions
new_instructions = combine_instruction_input(new_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
new_embeddings = model.encode(new_instructions)

# Initialize empty list
final_data = []
existing_embeddings = []

# For each new instruction, check if it's sufficiently different from existing instructions
for i, new_instruction in enumerate(new_instructions):
    # If list is empty, add the first datapoint
    if not final_data:
        final_data.append(new_data[i])
        existing_embeddings.append(new_embeddings[i])
    else:
        # Compute similarity scores with existing instructions
        similarity_scores = cosine_similarity([new_embeddings[i]], existing_embeddings)

        # If new instruction is sufficiently different, add it to the final_data
        if np.max(similarity_scores) <= 0.7:
            final_data.append(new_data[i])
            existing_embeddings.append(new_embeddings[i])

# Save the final_data to a new json file
with open('unique_data_best.json', 'w') as f:
    json.dump(final_data, f, indent=1)
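
The loop above calls cosine_similarity once per new item, which gets slow for large datasets. A rough sketch (not part of the original notes) of a vectorized variant that computes the full similarity matrix once with sentence_transformers.util.cos_sim and then runs the same greedy kept-only comparison with the 0.7 threshold; it reuses new_embeddings, new_instructions, and new_data from the script above:

from sentence_transformers import util

# One call builds the full (n, n) pairwise cosine similarity matrix.
sim_matrix = util.cos_sim(new_embeddings, new_embeddings)

final_data = []
kept_idx = []
for i in range(len(new_instructions)):
    # Compare only against items already kept, mirroring the loop above.
    if not kept_idx or sim_matrix[i, kept_idx].max().item() <= 0.7:
        final_data.append(new_data[i])
        kept_idx.append(i)

Note the trade-off: the full matrix costs O(n²) memory, so this variant is only sensible while n stays in the tens of thousands.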

Alternative library: Text2vec
Reference
TODO: write up the differences from SentenceTransformer in detail when there's time.
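
Text2vec exposes a similar embedding API for Chinese text. A minimal sketch, assuming the text2vec package and the shibing624/text2vec-base-chinese model named in its README (both assumptions, not from these notes):

# Sketch: assumes `pip install text2vec`; model name taken from the
# project's README (an assumption, verify before use).
from text2vec import SentenceModel

t2v_model = SentenceModel("shibing624/text2vec-base-chinese")
t2v_embeddings = t2v_model.encode(["如何学习机器学习?", "机器学习该怎么入门?"])
print(t2v_embeddings.shape)  # (2, hidden_dim)

The resulting embeddings can be fed into the same cosine_similarity dedup loop used above.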
Deduplication

Deduplication code

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def sbert_dedup(texts, threshold=0.9, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Sentence-BERT semantic deduplication."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts)

    clean_texts = []
    for i, text in enumerate(texts):
        if i == 0:
            clean_texts.append(text)
            continue
        # Note: this compares against ALL earlier texts, including ones that
        # were already dropped as duplicates, unlike the kept-only comparison
        # in the first script.
        sims = cosine_similarity([embeddings[i]], embeddings[:i]).flatten()
        if not (sims > threshold).any():
            clean_texts.append(text)
    return clean_texts

# Usage example
input_file = ""
data = pd.read_excel(input_file)
output_file = ""
clean_data = sbert_dedup(data["样本内容"].tolist(), threshold=0.96)
print(len(clean_data))
pd.DataFrame({"content": clean_data}).to_excel(output_file, index=False)
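
For larger corpora, the pairwise loop in sbert_dedup is quadratic. sentence_transformers also ships util.community_detection, which groups near-duplicates in one pass; a rough sketch of using it for dedup (cluster_dedup is a hypothetical helper, and the threshold value is carried over from above):

from sentence_transformers import SentenceTransformer, util

def cluster_dedup(texts, threshold=0.9, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Keep one representative per cluster of near-duplicate texts."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, convert_to_tensor=True)
    # Groups indices whose pairwise cosine similarity exceeds the threshold;
    # min_community_size=1 so singleton texts also form their own group.
    clusters = util.community_detection(embeddings, threshold=threshold, min_community_size=1)
    clustered = {i for c in clusters for i in c}
    # Keep the first member of each cluster, plus any index left unclustered.
    keep = sorted({c[0] for c in clusters} | (set(range(len(texts))) - clustered))
    return [texts[i] for i in keep]

Unlike the greedy loop, this picks the most central member of each duplicate group as the representative rather than simply the first one encountered.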
