Gene similarity computation

1.similar_gen.cpp

#include <string.h>
#include <iostream>
#include <string>
#include <algorithm>
#include "boost/noncopyable.hpp"
using namespace std;
class similar_gen : public boost::noncopyable {
public:
    inline static similar_gen &get() {
        static similar_gen obj;
        return obj;
    }
    int compute_similarity(const string &gen_str0, const string &gen_str1) {
        if (false == check_gen_str(gen_str0) ||  false == check_gen_str(gen_str1)) {
            cerr << "invalid gen string...!" << endl;
            return -1;
        }
        make_status_equation(gen_str0, gen_str1);
        int size0 = gen_str0.size();
        int size1 = gen_str0.size();
        for (int i = 1;i <= size0;i++) {
            for (int j = 1;j <= size1;j++) {
                status_equation_[i][j] = max(status_equation_[i][j], status_equation_[i - 1][j - 1] + get_similarity(gen_str0[i - 1], gen_str1[j - 1]));
                status_equation_[i][j] = max(status_equation_[i][j], status_equation_[i][j - 1] + get_similarity(gen_str1[j - 1], '-'));
                status_equation_[i][j] = max(status_equation_[i][j], status_equation_[i - 1][j] + get_similarity(gen_str0[i - 1], '-'));
            }
        }
        return status_equation_[size0][size1];

    }
private:
    bool check_gen_str(const string &gen_str) {
        if (gen_str.size() > max_gen_len) {
            return false;
        }
        for (auto &ch : gen_str) {
            if (ch != 'A' && ch != 'C' && ch != 'G' && ch != 'T') {
                return false;
            }
        }
        return true;
    }
    void make_similarity() {
        similarity_array_['A']['A'] = 5;
        similarity_array_['A']['C'] = -1;
        similarity_array_['A']['G'] = -2;
        similarity_array_['A']['T'] = -1;
        similarity_array_['A']['-'] = -3;

        similarity_array_['C']['A'] = -1;
        similarity_array_['C']['C'] = 5;
        similarity_array_['C']['G'] = -3;
        similarity_array_['C']['T'] = -2;
        similarity_array_['C']['-'] = -4;

        similarity_array_['G']['A'] = -2;
        similarity_array_['G']['C'] = -3;
        similarity_array_['G']['G'] = 5;
        similarity_array_['G']['T'] = -2;
        similarity_array_['G']['-'] = -2;

        similarity_array_['T']['A'] = -1;
        similarity_array_['T']['C'] = -2;
        similarity_array_['T']['G'] = -2;
        similarity_array_['T']['T'] = 5;
        similarity_array_['T']['-'] = -1;

        similarity_array_['-']['A'] = -3;
        similarity_array_['-']['C'] = -4;
        similarity_array_['-']['G'] = -2;
        similarity_array_['-']['T'] = -1;
        similarity_array_['-']['-'] = -10;
    }
    inline int get_similarity(char ch0, char ch1) {
        return similarity_array_[ch0][ch1];
    }
    void make_status_equation(const string &gen_str0, const string &gen_str1) {
        memset(status_equation_, 0, sizeof(status_equation_));
        int size = gen_str0.size();
        for (int i = 1;i <= size;i++) {
            status_equation_[i][0] = status_equation_[i - 1][0] + get_similarity(gen_str0[i - 1], '-');
        }
        size = gen_str1.size();
        for (int i = 1;i <= size;i++) {
            status_equation_[0][i] = status_equation_[0][i - 1] + get_similarity(gen_str1[i - 1], '-');
        }
    }
private:
    similar_gen() {
        make_similarity();
    }
    virtual ~similar_gen() = default;
private:
    static const int max_gen_len = 100;
private:
    int similarity_array_[256][256];
    int status_equation_[max_gen_len + 1][max_gen_len + 1];

};
int main() {
    string str0 = "AGTGATG";
    string str1 = "GTTAG";
    cout << similar_gen::get().compute_similarity(str0, str1) << endl;

    return 0;
}

2.make.sh

g++ -std=c++14 -g -o Test similar_gen.cpp -I ../boost_1_69_0/ -L ../boost_1_69_0/libs/

import json import torch import os from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM from tqdm import tqdm from typing import List, Dict, Any, Tuple, Optional from sentence_transformers import util class MiniLMEvaluator: def __init__(self, model_path: str, test_data_path: str, gen_model_path: Optional[str] = None, similarity_threshold: float = 0.7): """初始化评估器 - 完全离线模式""" self.device = "cuda" if torch.cuda.is_available() else "cpu" self.similarity_threshold = similarity_threshold # 加载MiniLM模型(相似度计算)- 完全离线 print(f"正在加载本地相似度模型: {model_path}...") self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True) self.model = AutoModel.from_pretrained(model_path, local_files_only=True).to(self.device) self.model.eval() # 加载生成模型 - 完全离线 self.gen_model = None self.gen_tokenizer = None if gen_model_path and os.path.exists(gen_model_path): print(f"正在加载本地生成模型: {gen_model_path}...") try: # 强制使用本地文件 self.gen_tokenizer = AutoTokenizer.from_pretrained( gen_model_path, trust_remote_code=True, local_files_only=True ) print("成功加载本地tokenizer") self.gen_model = AutoModelForCausalLM.from_pretrained( gen_model_path, device_map="auto", torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, trust_remote_code=True, local_files_only=True ).eval() print("成功加载本地模型") except Exception as e: print(f"加载本地模型失败: {str(e)}") print("请确保模型文件完整且路径正确") self.gen_model = None else: print(f"警告: 生成模型路径不存在 {gen_model_path}") # 加载测试数据 print(f"正在加载测试数据: {test_data_path}...") self.test_data = self._load_test_data(test_data_path) if not self.test_data: raise ValueError("测试数据加载失败或为空") def _load_test_data(self, test_data_path: str) -> List[Dict[str, Any]]: """加载测试数据""" try: with open(test_data_path, 'r', encoding='utf-8') as f: data = json.load(f) if not isinstance(data, list): raise ValueError(f"测试数据应为列表,但得到 {type(data)}") print(f"成功加载 {len(data)} 条测试数据") return data except Exception as e: raise ValueError(f"加载测试数据失败: {str(e)}") def get_embedding(self, text: 
str) -> torch.Tensor: """获取文本的嵌入向量""" if not text.strip(): return torch.zeros(384).to(self.device) inputs = self.tokenizer( text, return_tensors="pt", padding=True, truncation=True, max_length=512 ).to(self.device) with torch.no_grad(): outputs = self.model(**inputs) # 平均池化生成句子嵌入 return self._mean_pooling(outputs, inputs['attention_mask']).cpu().squeeze(0) def _mean_pooling(self, model_output, attention_mask): """平均池化生成句子嵌入""" token_embeddings = model_output.last_hidden_state input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) def generate_response(self, prompt: str) -> str: """使用生成模型生成回复""" if not self.gen_model or not self.gen_tokenizer: # 如果没有生成模型,则返回空字符串 return "" inputs = self.gen_tokenizer(prompt, return_tensors="pt").to(self.device) with torch.no_grad(): outputs = self.gen_model.generate( **inputs, max_new_tokens=512, temperature=0.1, top_p=0.8, do_sample=True ) response = self.gen_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) return response def evaluate_pair(self, text1: str, text2: str) -> Dict[str, Any]: """评估文本对相似度""" emb1 = self.get_embedding(text1) emb2 = self.get_embedding(text2) similarity = util.pytorch_cos_sim(emb1.unsqueeze(0), emb2.unsqueeze(0)).item() return { "text1": text1, "text2": text2, "similarity": similarity, "is_correct": similarity >= self.similarity_threshold, "threshold": self.similarity_threshold } def evaluate(self) -> Tuple[float, float, List[Dict[str, Any]]]: """执行完整评估流程""" if not self.test_data: raise ValueError("无有效测试数据") total_similarity = 0.0 correct_count = 0 results = [] for sample in tqdm(self.test_data, desc="评估进度"): try: # 提取对话内容 if not isinstance(sample, dict) or "conversations" not in sample: print(f"跳过无效样本: {sample.get('id', '未知')}") continue # 获取最后一轮对话 conversations = sample["conversations"] user_input = None reference_reply = None 
# 拼接完整的对话历史 full_dialog = "" for i, turn in enumerate(conversations): if turn.get("from") == "user": user_input = turn.get("value", "").strip() full_dialog += f"用户: {user_input}\n" elif turn.get("from") == "assistant": reference_reply = turn.get("value", "").strip() if i < len(conversations) - 1: # 不是最后一轮 full_dialog += f"助手: {reference_reply}\n" if not user_input or not reference_reply: print(f"跳过不完整对话: {sample.get('id', '未知')}") continue # 生成模型回复(基于完整对话历史) prompt = full_dialog + "助手:" generated_reply = self.generate_response(prompt) if self.gen_model else reference_reply # 评估生成结果 eval_result = self.evaluate_pair(generated_reply, reference_reply) results.append({ "id": sample.get("id", len(results)), "user_input": user_input, "generated_reply": generated_reply, "reference_reply": reference_reply, **eval_result }) total_similarity += eval_result["similarity"] if eval_result["is_correct"]: correct_count += 1 except Exception as e: print(f"处理样本出错: {str(e)}") continue # 计算指标 valid_samples = len(results) if valid_samples == 0: return 0.0, 0.0, [] avg_similarity = total_similarity / valid_samples accuracy = correct_count / valid_samples return avg_similarity, accuracy, results def save_results(self, results: List[Dict[str, Any]], output_path: str): """保存评估结果""" with open(output_path, 'w', encoding='utf-8') as f: json.dump({ "summary": { "total_samples": len(results), "avg_similarity": sum(r["similarity"] for r in results) / len(results), "accuracy": sum(r["is_correct"] for r in results) / len(results), "threshold": self.similarity_threshold }, "details": results }, f, ensure_ascii=False, indent=2) if __name__ == "__main__": # 配置参数 - 全部使用本地路径 CONFIG = { "model_path": "/root/autodl-tmp/paraphrase-multilingual-MiniLM-L12-v2", # 本地相似度模型路径 "gen_model_path": "/root/autodl-tmp/Qwen3-14b", # 本地生成模型路径 "test_data_path": "/root/autodl-tmp/test_new.json", # 测试数据路径 "output_path": "/root/autodl-tmp/evaluation_results.json", # 输出路径 "similarity_threshold": 0.7 # 相似度阈值 } try: 
print("="*50) print("开始评估 - 离线模式".center(40)) print("="*50) # 检查模型路径是否存在 if not os.path.exists(CONFIG["model_path"]): raise FileNotFoundError(f"相似度模型路径不存在: {CONFIG['model_path']}") # 初始化评估器 evaluator = MiniLMEvaluator( model_path=CONFIG["model_path"], test_data_path=CONFIG["test_data_path"], gen_model_path=CONFIG["gen_model_path"], similarity_threshold=CONFIG["similarity_threshold"] ) # 执行评估 avg_sim, accuracy, results = evaluator.evaluate() # 打印结果 print("\n" + "评估结果".center(40, "-")) print(f"评估样本数: {len(results)}") print(f"平均相似度: {avg_sim:.4f}") print(f"准确率(≥{CONFIG['similarity_threshold']}): {accuracy:.2%}") print(f"正确样本数: {int(accuracy * len(results))}/{len(results)}") # 保存结果 evaluator.save_results(results, CONFIG["output_path"]) print(f"\n结果已保存至: {CONFIG['output_path']}") except Exception as e: print(f"\n评估过程出错: {str(e)}") print("请确保:") print("1. 所有模型路径都是本地绝对路径") print("2. 模型文件完整且未损坏") print("3. 测试数据文件存在且格式正确") print("4. 有足够的存储空间") root@autodl-container-8bf9409db0-3243e00c:~/autodl-tmp# python utils_one.py ================================================== 开始评估 - 离线模式 ================================================== 正在加载本地相似度模型: /root/autodl-tmp/paraphrase-multilingual-MiniLM-L12-v2... 正在加载本地生成模型: /root/autodl-tmp/Qwen3-14b... 成功加载本地tokenizer 加载本地模型失败: argument of type 'NoneType' is not iterable 请确保模型文件完整且路径正确 正在加载测试数据: /root/autodl-tmp/test_new.json... 成功加载 5434 条测试数据 评估进度: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5434/5434 [00:48<00:00, 111.78it/s] ------------------评估结果------------------ 评估样本数: 5434 平均相似度: 1.0000 准确率(≥0.7): 100.00% 正确样本数: 5434/5434为什么评估结果一直是100
06-05
内容概要:本文详细介绍了“秒杀商城”微服务架构的设计与实战全过程,涵盖系统从需求分析、服务拆分、技术选型到核心功能开发、分布式事务处理、容器化部署及监控链路追踪的完整流程。重点解决了高并发场景下的超卖问题,采用Redis预减库存、消息队列削峰、数据库乐观锁等手段保障数据一致性,并通过Nacos实现服务注册发现与配置管理,利用Seata处理跨服务分布式事务,结合RabbitMQ实现异步下单,提升系统吞吐能力。同时,项目支持Docker Compose快速部署和Kubernetes生产级编排,集成Sleuth+Zipkin链路追踪与Prometheus+Grafana监控体系,构建可观测性强的微服务系统。; 适合人群:具备Java基础和Spring Boot开发经验,熟悉微服务基本概念的中高级研发人员,尤其是希望深入理解高并发系统设计、分布式事务、服务治理等核心技术的开发者;适合工作2-5年、有志于转型微服务或提升架构能力的工程师; 使用场景及目标:①学习如何基于Spring Cloud Alibaba构建完整的微服务项目;②掌握秒杀场景下高并发、超卖控制、异步化、削峰填谷等关键技术方案;③实践分布式事务(Seata)、服务熔断降级、链路追踪、统一配置中心等企业级中间件的应用;④完成从本地开发到容器化部署的全流程落地; 阅读建议:建议按照文档提供的七个阶段循序渐进地动手实践,重点关注秒杀流程设计、服务间通信机制、分布式事务实现和系统性能优化部分,结合代码调试与监控工具深入理解各组件协作原理,真正掌握高并发微服务系统的构建能力。
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值