Using SentenceTransformer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

device = "mps"  # Apple Silicon GPU; use "cuda" or "cpu" as appropriate

# Load new JSON data
with open('unique_.json', 'r') as f:
    new_data = json.load(f)

# Combine instruction and input into a single text per record
def combine_instruction_input(data):
    instructions = []
    for d in data:
        instruction = d['instruction']
        input_text = d['input']
        if input_text != '':
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Extract instructions
new_instructions = combine_instruction_input(new_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
new_embeddings = model.encode(new_instructions)

# Greedy filter: keep an item only if it is sufficiently different
# from everything already kept
final_data = []
existing_embeddings = []
for i, new_instruction in enumerate(new_instructions):
    # Keep the first datapoint unconditionally
    if not final_data:
        final_data.append(new_data[i])
        existing_embeddings.append(new_embeddings[i])
    else:
        # Compute similarity scores against the kept instructions
        similarity_scores = cosine_similarity([new_embeddings[i]], existing_embeddings)
        # If the new instruction is sufficiently different, keep it
        if np.max(similarity_scores) <= 0.7:
            final_data.append(new_data[i])
            existing_embeddings.append(new_embeddings[i])

# Save the deduplicated data to a new JSON file
with open('unique_data_best.json', 'w') as f:
    json.dump(final_data, f, indent=1)
Alternative library: Text2vec
Reference
I'll write up the differences between the two in detail when I have time.
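In the meantime, here is a minimal sketch of the same embedding step with Text2vec. It assumes the text2vec package (shibing624/text2vec) and its SentenceModel.encode API; the model name and the two sample sentences are illustrative only. The resulting embeddings plug straight into the greedy filter above.

# Sketch, assuming the text2vec package and its SentenceModel API;
# the model name and sample sentences are illustrative only.
from text2vec import SentenceModel
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceModel('shibing624/text2vec-base-chinese')  # Chinese-oriented model
embeddings = model.encode(['如何注册账号?', '怎样创建新账户?'])
print(cosine_similarity([embeddings[0]], [embeddings[1]]))  # near-duplicate pair scores high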
Deduplication
Deduplication code
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def sbert_dedup(texts, threshold=0.9, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Sentence-BERT semantic deduplication."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts)
    clean_texts = []
    for i, text in enumerate(texts):
        if i == 0:
            clean_texts.append(text)
            continue
        # Note: this compares against ALL earlier texts (embeddings[:i]),
        # including ones already dropped as duplicates, unlike the first
        # script, which only compares against the kept items.
        sims = cosine_similarity([embeddings[i]], embeddings[:i]).flatten()
        if not (sims > threshold).any():
            clean_texts.append(text)
    return clean_texts

# Usage example
input_file = ""   # path to the input Excel file
output_file = ""  # path for the deduplicated output
data = pd.read_excel(input_file)
clean_data = sbert_dedup(data["样本内容"].tolist(), threshold=0.96)
print(len(clean_data))
pd.DataFrame({"content": clean_data}).to_excel(output_file, index=False)