Text similarity notes

Using SentenceTransformer

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

device = "mps"  # Apple Silicon GPU; use "cuda" or "cpu" depending on your hardware

# Load new JSON data
with open('unique_.json', 'r') as f:
    new_data = json.load(f)

# Function to combine instruction and input
def combine_instruction_input(data):
    instructions = []
    for d in data:
        instruction = d['instruction']
        input_text = d['input']
        if input_text != '':
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Extract instructions
new_instructions = combine_instruction_input(new_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
new_embeddings = model.encode(new_instructions)

# Initialize empty list
final_data = []
existing_embeddings = []

# For each new instruction, check if it's sufficiently different from existing instructions
for i, new_instruction in enumerate(new_instructions):
    # If list is empty, add the first datapoint
    if not final_data:
        final_data.append(new_data[i])
        existing_embeddings.append(new_embeddings[i])
    else:
        # Compute similarity scores with existing instructions
        similarity_scores = cosine_similarity([new_embeddings[i]], existing_embeddings)

        # If new instruction is sufficiently different, add it to the final_data
        if np.max(similarity_scores) <= 0.7:
            final_data.append(new_data[i])
            existing_embeddings.append(new_embeddings[i])

# Save the final_data to a new json file
with open('unique_data_best.json', 'w') as f:
    json.dump(final_data, f, indent=1)
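
The loop above calls cosine_similarity once per new item, which gets slow for large datasets. A rough sketch (not part of the original notes) of a vectorized variant that computes the full similarity matrix once with sentence_transformers.util.cos_sim and then runs the same greedy kept-only comparison with the 0.7 threshold; it reuses new_embeddings, new_instructions, and new_data from the script above:

from sentence_transformers import util

# One call builds the full (n, n) pairwise cosine similarity matrix.
sim_matrix = util.cos_sim(new_embeddings, new_embeddings)

final_data = []
kept_idx = []
for i in range(len(new_instructions)):
    # Compare only against items already kept, mirroring the loop above.
    if not kept_idx or sim_matrix[i, kept_idx].max().item() <= 0.7:
        final_data.append(new_data[i])
        kept_idx.append(i)

Note the trade-off: the full matrix costs O(n²) memory, so this variant is only sensible while n stays in the tens of thousands.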

Alternative library: Text2vec
Reference
TODO: write up the differences from SentenceTransformer in detail when there's time.
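
Text2vec exposes a similar embedding API for Chinese text. A minimal sketch, assuming the text2vec package and the shibing624/text2vec-base-chinese model named in its README (both assumptions, not from these notes):

# Sketch: assumes `pip install text2vec`; model name taken from the
# project's README (an assumption, verify before use).
from text2vec import SentenceModel

t2v_model = SentenceModel("shibing624/text2vec-base-chinese")
t2v_embeddings = t2v_model.encode(["如何学习机器学习?", "机器学习该怎么入门?"])
print(t2v_embeddings.shape)  # (2, hidden_dim)

The resulting embeddings can be fed into the same cosine_similarity dedup loop used above.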
Deduplication

Deduplication code

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def sbert_dedup(texts, threshold=0.9, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Sentence-BERT semantic deduplication."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts)

    clean_texts = []
    for i, text in enumerate(texts):
        if i == 0:
            clean_texts.append(text)
            continue
        # Note: this compares against ALL earlier texts, including ones that
        # were already dropped as duplicates, unlike the kept-only comparison
        # in the first script.
        sims = cosine_similarity([embeddings[i]], embeddings[:i]).flatten()
        if not (sims > threshold).any():
            clean_texts.append(text)
    return clean_texts

# Usage example
input_file = ""
data = pd.read_excel(input_file)
output_file = ""
clean_data = sbert_dedup(data["样本内容"].tolist(), threshold=0.96)
print(len(clean_data))
pd.DataFrame({"content": clean_data}).to_excel(output_file, index=False)
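
For larger corpora, the pairwise loop in sbert_dedup is quadratic. sentence_transformers also ships util.community_detection, which groups near-duplicates in one pass; a rough sketch of using it for dedup (cluster_dedup is a hypothetical helper, and the threshold value is carried over from above):

from sentence_transformers import SentenceTransformer, util

def cluster_dedup(texts, threshold=0.9, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Keep one representative per cluster of near-duplicate texts."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, convert_to_tensor=True)
    # Groups indices whose pairwise cosine similarity exceeds the threshold;
    # min_community_size=1 so singleton texts also form their own group.
    clusters = util.community_detection(embeddings, threshold=threshold, min_community_size=1)
    clustered = {i for c in clusters for i in c}
    # Keep the first member of each cluster, plus any index left unclustered.
    keep = sorted({c[0] for c in clusters} | (set(range(len(texts))) - clustered))
    return [texts[i] for i in keep]

Unlike the greedy loop, this picks the most central member of each duplicate group as the representative rather than simply the first one encountered.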
