import json
import torch
import os
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Optional
from sentence_transformers import util
class MiniLMEvaluator:
def __init__(self,
model_path: str,
test_data_path: str,
gen_model_path: Optional[str] = None,
similarity_threshold: float = 0.7):
"""初始化评估器 - 完全离线模式"""
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.similarity_threshold = similarity_threshold
# 加载MiniLM模型(相似度计算)- 完全离线
print(f"正在加载本地相似度模型: {model_path}...")
self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
self.model = AutoModel.from_pretrained(model_path, local_files_only=True).to(self.device)
self.model.eval()
# 加载生成模型 - 完全离线
self.gen_model = None
self.gen_tokenizer = None
if gen_model_path and os.path.exists(gen_model_path):
print(f"正在加载本地生成模型: {gen_model_path}...")
try:
# 强制使用本地文件
self.gen_tokenizer = AutoTokenizer.from_pretrained(
gen_model_path,
trust_remote_code=True,
local_files_only=True
)
print("成功加载本地tokenizer")
self.gen_model = AutoModelForCausalLM.from_pretrained(
gen_model_path,
device_map="auto",
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
trust_remote_code=True,
local_files_only=True
).eval()
print("成功加载本地模型")
except Exception as e:
print(f"加载本地模型失败: {str(e)}")
print("请确保模型文件完整且路径正确")
self.gen_model = None
else:
print(f"警告: 生成模型路径不存在 {gen_model_path}")
# 加载测试数据
print(f"正在加载测试数据: {test_data_path}...")
self.test_data = self._load_test_data(test_data_path)
if not self.test_data:
raise ValueError("测试数据加载失败或为空")
def _load_test_data(self, test_data_path: str) -> List[Dict[str, Any]]:
"""加载测试数据"""
try:
with open(test_data_path, 'r', encoding='utf-8') as f:
data = json.load(f)
if not isinstance(data, list):
raise ValueError(f"测试数据应为列表,但得到 {type(data)}")
print(f"成功加载 {len(data)} 条测试数据")
return data
except Exception as e:
raise ValueError(f"加载测试数据失败: {str(e)}")
def get_embedding(self, text: str) -> torch.Tensor:
"""获取文本的嵌入向量"""
if not text.strip():
return torch.zeros(384).to(self.device)
inputs = self.tokenizer(
text,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(self.device)
with torch.no_grad():
outputs = self.model(**inputs)
# 平均池化生成句子嵌入
return self._mean_pooling(outputs, inputs['attention_mask']).cpu().squeeze(0)
def _mean_pooling(self, model_output, attention_mask):
"""平均池化生成句子嵌入"""
token_embeddings = model_output.last_hidden_state
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def generate_response(self, prompt: str) -> str:
"""使用生成模型生成回复"""
if not self.gen_model or not self.gen_tokenizer:
# 如果没有生成模型,则返回空字符串
return ""
inputs = self.gen_tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.no_grad():
outputs = self.gen_model.generate(
**inputs,
max_new_tokens=512,
temperature=0.1,
top_p=0.8,
do_sample=True
)
response = self.gen_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response
def evaluate_pair(self, text1: str, text2: str) -> Dict[str, Any]:
"""评估文本对相似度"""
emb1 = self.get_embedding(text1)
emb2 = self.get_embedding(text2)
similarity = util.pytorch_cos_sim(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
return {
"text1": text1,
"text2": text2,
"similarity": similarity,
"is_correct": similarity >= self.similarity_threshold,
"threshold": self.similarity_threshold
}
def evaluate(self) -> Tuple[float, float, List[Dict[str, Any]]]:
"""执行完整评估流程"""
if not self.test_data:
raise ValueError("无有效测试数据")
total_similarity = 0.0
correct_count = 0
results = []
for sample in tqdm(self.test_data, desc="评估进度"):
try:
# 提取对话内容
if not isinstance(sample, dict) or "conversations" not in sample:
print(f"跳过无效样本: {sample.get('id', '未知')}")
continue
# 获取最后一轮对话
conversations = sample["conversations"]
user_input = None
reference_reply = None
# 拼接完整的对话历史
full_dialog = ""
for i, turn in enumerate(conversations):
if turn.get("from") == "user":
user_input = turn.get("value", "").strip()
full_dialog += f"用户: {user_input}\n"
elif turn.get("from") == "assistant":
reference_reply = turn.get("value", "").strip()
if i < len(conversations) - 1: # 不是最后一轮
full_dialog += f"助手: {reference_reply}\n"
if not user_input or not reference_reply:
print(f"跳过不完整对话: {sample.get('id', '未知')}")
continue
# 生成模型回复(基于完整对话历史)
prompt = full_dialog + "助手:"
generated_reply = self.generate_response(prompt) if self.gen_model else reference_reply
# 评估生成结果
eval_result = self.evaluate_pair(generated_reply, reference_reply)
results.append({
"id": sample.get("id", len(results)),
"user_input": user_input,
"generated_reply": generated_reply,
"reference_reply": reference_reply,
**eval_result
})
total_similarity += eval_result["similarity"]
if eval_result["is_correct"]:
correct_count += 1
except Exception as e:
print(f"处理样本出错: {str(e)}")
continue
# 计算指标
valid_samples = len(results)
if valid_samples == 0:
return 0.0, 0.0, []
avg_similarity = total_similarity / valid_samples
accuracy = correct_count / valid_samples
return avg_similarity, accuracy, results
def save_results(self, results: List[Dict[str, Any]], output_path: str):
"""保存评估结果"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump({
"summary": {
"total_samples": len(results),
"avg_similarity": sum(r["similarity"] for r in results) / len(results),
"accuracy": sum(r["is_correct"] for r in results) / len(results),
"threshold": self.similarity_threshold
},
"details": results
}, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
# 配置参数 - 全部使用本地路径
CONFIG = {
"model_path": "/root/autodl-tmp/paraphrase-multilingual-MiniLM-L12-v2", # 本地相似度模型路径
"gen_model_path": "/root/autodl-tmp/Qwen3-14b", # 本地生成模型路径
"test_data_path": "/root/autodl-tmp/test_new.json", # 测试数据路径
"output_path": "/root/autodl-tmp/evaluation_results.json", # 输出路径
"similarity_threshold": 0.7 # 相似度阈值
}
try:
print("="*50)
print("开始评估 - 离线模式".center(40))
print("="*50)
# 检查模型路径是否存在
if not os.path.exists(CONFIG["model_path"]):
raise FileNotFoundError(f"相似度模型路径不存在: {CONFIG['model_path']}")
# 初始化评估器
evaluator = MiniLMEvaluator(
model_path=CONFIG["model_path"],
test_data_path=CONFIG["test_data_path"],
gen_model_path=CONFIG["gen_model_path"],
similarity_threshold=CONFIG["similarity_threshold"]
)
# 执行评估
avg_sim, accuracy, results = evaluator.evaluate()
# 打印结果
print("\n" + "评估结果".center(40, "-"))
print(f"评估样本数: {len(results)}")
print(f"平均相似度: {avg_sim:.4f}")
print(f"准确率(≥{CONFIG['similarity_threshold']}): {accuracy:.2%}")
print(f"正确样本数: {int(accuracy * len(results))}/{len(results)}")
# 保存结果
evaluator.save_results(results, CONFIG["output_path"])
print(f"\n结果已保存至: {CONFIG['output_path']}")
except Exception as e:
print(f"\n评估过程出错: {str(e)}")
print("请确保:")
print("1. 所有模型路径都是本地绝对路径")
print("2. 模型文件完整且未损坏")
print("3. 测试数据文件存在且格式正确")
print("4. 有足够的存储空间")
root@autodl-container-8bf9409db0-3243e00c:~/autodl-tmp# python utils_one.py
==================================================
开始评估 - 离线模式
==================================================
正在加载本地相似度模型: /root/autodl-tmp/paraphrase-multilingual-MiniLM-L12-v2...
正在加载本地生成模型: /root/autodl-tmp/Qwen3-14b...
成功加载本地tokenizer
加载本地模型失败: argument of type 'NoneType' is not iterable
请确保模型文件完整且路径正确
正在加载测试数据: /root/autodl-tmp/test_new.json...
成功加载 5434 条测试数据
评估进度: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5434/5434 [00:48<00:00, 111.78it/s]
------------------评估结果------------------
评估样本数: 5434
平均相似度: 1.0000
准确率(≥0.7): 100.00%
正确样本数: 5434/5434为什么评估结果一直是100