# Using SentenceTransformer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
# Device string handed to SentenceTransformer further down; "mps" selects
# PyTorch's Apple-GPU (Metal) backend.
device = "mps"

# Load new JSON data.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale default.
with open('unique_.json', 'r', encoding='utf-8') as f:
    new_data = json.load(f)
# Function to combine instruction and input
def combine_instruction_input(data):
    """Build one text string per record by joining instruction and input.

    Args:
        data: Iterable of mappings. Each must have an ``'instruction'`` key;
            ``'input'`` is optional.

    Returns:
        A list with exactly one string per record, in the same order as
        ``data``. When a record has a non-empty ``'input'``, the result is
        ``instruction + ' ' + input``; otherwise it is the instruction alone.

    Raises:
        KeyError: If a record has no ``'instruction'`` key.
    """
    instructions = []
    for d in data:
        instruction = d['instruction']
        # A missing or None 'input' is treated the same as an empty string.
        input_text = d.get('input')
        if input_text:
            instruction += ' ' + input_text
        # Append unconditionally: callers index the result in parallel with
        # `data`, so every record must yield exactly one entry.
        instructions.append(instruction)
    return instructions
# Extract instructions
new_instructions = combine_instruction_input(new_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings (one row per entry of new_instructions)
new_embeddings = model.encode(new_instructions)

# A record is kept only if its highest cosine similarity to every
# already-kept record is at or below this value.
SIMILARITY_THRESHOLD = 0.7

# Greedy filter: walk the records in file order and keep each one that is
# sufficiently different from everything kept so far. The result therefore
# depends on the order of records in the input.
final_data = []
existing_embeddings = []
for record, embedding in zip(new_data, new_embeddings):
    # The first record has nothing to be compared against and is always kept.
    is_novel = (
        not existing_embeddings
        or np.max(cosine_similarity([embedding], existing_embeddings))
        <= SIMILARITY_THRESHOLD
    )
    if is_novel:
        final_data.append(record)
        existing_embeddings.append(embedding)

# Save the final_data to a new json file
with open('unique_data_best.json', 'w') as f:
    json.dump(final_data, f, indent=1)