文本相似度notes

使用 SentenceTransformer 计算句向量,再按余弦相似度(阈值 0.7)对指令去重:

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# Torch device passed to the embedding model. "mps" is PyTorch's Apple
# (Metal) backend; switch to "cuda" or "cpu" when running on other hardware.
device = "mps"

# Load the candidate records. JSON text is UTF-8 by specification, so decode
# it explicitly instead of relying on the platform's default locale encoding.
with open('unique_.json', 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Function to combine instruction and input
def combine_instruction_input(data):
    """Build one text string per record from its instruction and input.

    Args:
        data: Iterable of dict-like records. Each must provide an
            ``'instruction'`` string; ``'input'`` is optional.

    Returns:
        A list with one string per record, in the same order: the
        instruction alone when ``'input'`` is missing, ``None`` or ``''``,
        otherwise ``instruction + ' ' + input``.

    Raises:
        KeyError: If a record has no ``'instruction'`` key.
    """
    instructions = []
    for d in data:
        instruction = d['instruction']
        # A record whose input is absent or null is treated like one with an
        # empty input, instead of aborting the whole run.
        input_text = d.get('input')
        if input_text is not None and input_text != '':
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Build one text string per record (instruction plus optional input)
new_instructions = combine_instruction_input(new_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute one embedding per instruction; entry i corresponds to new_data[i]
new_embeddings = model.encode(new_instructions)

# A record is kept only when its highest cosine similarity to the records
# already kept is at most this value.
SIMILARITY_THRESHOLD = 0.7

# Records kept so far and their embeddings, in matching order
final_data = []
existing_embeddings = []

# Greedy pass in file order: each record is compared only against records
# that were kept before it, so the earliest of a group of near-duplicates wins.
for record, embedding in zip(new_data, new_embeddings):
    # The first record has nothing to be compared with, so it is always kept
    # (the similarity call is skipped while the kept list is empty).
    is_novel = (
        not existing_embeddings
        or np.max(cosine_similarity([embedding], existing_embeddings))
        <= SIMILARITY_THRESHOLD
    )
    if is_novel:
        final_data.append(record)
        existing_embeddings.append(embedding)

# Save the kept records. Write UTF-8 and keep non-ASCII characters as-is so
# the output stays human-readable instead of being \uXXXX-escaped.
with open('unique_data_best.json', 'w', encoding='utf-8') as f:
    json.dump(final_data, f, indent=1, ensure_ascii=False)

可用库:Text2vec
参考
待补充:有时间再仔细写 Text2vec 与 SentenceTransformer 的区别
111
去重

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值