改进版的sp_helptext

本文介绍了一个改进型的 SQL Server 存储过程 sp_helptext2。该过程可以直接将对象定义文本以易于阅读的格式输出，并能自动识别同义词、显示其指向的原始对象。

SQL Server 中系统自带的 sp_helptext 存在一些问题，如显示格式不易阅读、空格未自动过滤等。

现开发改进型 sp_helptext2，可直接显示为 text 格式，且能自动找出同义词的源对象。

if exists(select 1 from sys.objects where type='P' and name='sp_helptext2')
   drop proc dbo.sp_helptext2
go 

create proc dbo.sp_helptext2
(@objectname sysname)
as
begin
set nocount on
declare @objectid int,
              @objecttype varchar(10),
              @Print nvarchar(max)

select @objectid=object_id,
           @objecttype=type
 from sys.objects 
 where type in('P','V','TR','FN','SN')
 and name=@objectname

if @objectid is null
begin
  print 'Invalid object name '''+@objectname+'''. '
  return
end

if @objecttype='SN'
begin
  select @Print='Synonym: '+@objectname+char(13)+char(10)
                          +'BaseObject: '+base_object_name 
    from sys.synonyms 
    where name=@objectname
  print @Print
  return
end

declare @T table(Col nvarchar(max))

insert @T(Col)
  select object_definition(@objectid)+char(13)+char(10)

while(select Col from @T)<>''
begin  
   select @Print=replace(left(Col,charindex(char(13)+char(10),Col)-1),char(32),' ')
     from @T

   print rtrim(@Print)

   update @T set Col=stuff(Col,1,charindex(char(13)+char(10),Col)+1,'')  
end

end


[root@yfw ~]# cd /www/wwwroot/szrengjing.com [root@yfw szrengjing.com]# mysql -u szrengjing_com -p Enter password: Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 93707 Server version: 5.7.42-log Source distribution Copyright (c) 2000, 2023, Oracle and/or its affiliates. Oracle is a registered trademark of Oracle Corporation and/or its affiliates. Other names may be trademarks of their respective owners. Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. mysql> USE szrengjing_com; Database changed mysql> CREATE TABLE IF NOT EXISTS ecs_user_credit ( -> id INT AUTO_INCREMENT PRIMARY KEY, -> user_id INT NOT NULL UNIQUE, -> score INT DEFAULT 80, -> level VARCHAR(20) AS ( -> CASE -> WHEN score >= 95 THEN '★★★★★' -> WHEN score >= 85 THEN '★★★★☆' -> WHEN score >= 75 THEN '★★★☆☆' -> WHEN score >= 60 THEN '★★☆☆☆' -> ELSE '★☆☆☆☆' -> END -> ) STORED, -> updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; Query OK, 0 rows affected, 1 warning (0.00 sec) mysql> CREATE TABLE IF NOT EXISTS ecs_user_credit_log ( -> id BIGINT AUTO_INCREMENT PRIMARY KEY, -> user_id INT NOT NULL, -> old_score INT, -> new_score INT, -> change_value INT, -> reason VARCHAR(100), -> created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; Query OK, 0 rows affected, 1 warning (0.00 sec) mysql> DROP TRIGGER IF EXISTS tr_after_update_credit; Query OK, 0 rows affected, 1 warning (0.01 sec) mysql> DROP PROCEDURE IF EXISTS sp_add_credit; Query OK, 0 rows affected, 1 warning (0.00 sec) mysql> DELIMITER ;; mysql> CREATE TRIGGER tr_after_update_credit -> AFTER UPDATE ON ecs_user_credit -> FOR EACH ROW -> BEGIN -> INSERT INTO ecs_user_credit_log (user_id, old_score, new_score, change_value, reason) -> VALUES (NEW.user_id, OLD.score, NEW.score, NEW.score - OLD.score, '系统调整'); -> END;; Query OK, 0 rows affected (0.01 sec) mysql> CREATE PROCEDURE 
sp_add_credit( -> IN p_user_id INT, -> IN p_change INT, -> IN p_reason VARCHAR(100) -> ) -> BEGIN -> DECLARE current_score INT DEFAULT 80; -> -- 查询当前分数并加锁防止并发 -> SELECT score INTO current_score FROM ecs_user_credit WHERE user_id = p_user_id FOR UPDATE; -> -> -- 插入或更新分数 -> INSERT INTO ecs_user_credit (user_id, score) -> VALUES (p_user_id, current_score + p_change) -> ON DUPLICATE KEY UPDATE score = score + p_change; -> END;; Query OK, 0 rows affected (0.00 sec) mysql> DELIMITER ; mysql> SHOW TABLES LIKE 'ecs_user_credit%'; +---------------------------------------------+ | Tables_in_szrengjing_com (ecs_user_credit%) | +---------------------------------------------+ | ecs_user_credit | | ecs_user_credit_log | +---------------------------------------------+ 2 rows in set (0.00 sec) mysql> SHOW TRIGGERS LIKE 'ecs_user_credit'\G *************************** 1. row *************************** Trigger: tr_update_credit_level Event: UPDATE Table: ecs_user_credit Statement: BEGIN IF NEW.score >= 95 THEN SET NEW.level = '★★★★★'; ELSEIF NEW.score >= 85 THEN SET NEW.level = '★★★★☆'; ELSEIF NEW.score >= 75 THEN SET NEW.level = '★★★☆☆'; ELSEIF NEW.score >= 60 THEN SET NEW.level = '★★☆☆☆'; ELSE SET NEW.level = '★☆☆☆☆'; END IF; END Timing: BEFORE Created: 2025-11-09 05:25:20.26 sql_mode: NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION Definer: szrengjing_com@localhost character_set_client: utf8 collation_connection: utf8_general_ci Database Collation: utf8mb4_general_ci *************************** 2. 
row *************************** Trigger: tr_after_update_credit Event: UPDATE Table: ecs_user_credit Statement: BEGIN INSERT INTO ecs_user_credit_log (user_id, old_score, new_score, change_value, reason) VALUES (NEW.user_id, OLD.score, NEW.score, NEW.score - OLD.score, '系统调整'); END Timing: AFTER Created: 2025-11-09 06:19:33.39 sql_mode: NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION Definer: szrengjing_com@localhost character_set_client: utf8 collation_connection: utf8_general_ci Database Collation: utf8mb4_general_ci 2 rows in set (0.00 sec) mysql> SHOW PROCEDURE STATUS WHERE Name = 'sp_add_credit'\G *************************** 1. row *************************** Db: szrengjing_com Name: sp_add_credit Type: PROCEDURE Definer: szrengjing_com@localhost Modified: 2025-11-09 06:19:48 Created: 2025-11-09 06:19:48 Security_type: DEFINER Comment: character_set_client: utf8 collation_connection: utf8_general_ci Database Collation: utf8mb4_general_ci 1 row in set (0.00 sec) mysql> CALL sp_add_credit(1, 5, '手动测试加分'); Query OK, 2 rows affected (0.01 sec) mysql> SELECT * FROM ecs_user_credit_log ORDER BY id DESC LIMIT 1\G *************************** 1. row *************************** id: 4 user_id: 1 old_score: 90 new_score: 95 change_value: 5 reason: 系统调整 created_at: 2025-11-09 06:21:06 1 row in set (0.00 sec) mysql> DROP TRIGGER IF EXISTS tr_update_credit_level; Query OK, 0 rows affected (0.01 sec) mysql> CALL sp_add_credit(2, 10, '新用户注册奖励'); Query OK, 1 row affected (0.00 sec) mysql> CALL sp_add_credit(1, -5, '违规扣分测试'); Query OK, 2 rows affected (0.01 sec) mysql> SELECT * FROM ecs_user_credit_log ORDER BY id DESC LIMIT 5\G *************************** 1. row *************************** id: 5 user_id: 1 old_score: 95 new_score: 90 change_value: -5 reason: 系统调整 created_at: 2025-11-09 06:22:39 *************************** 2. 
row *************************** id: 4 user_id: 1 old_score: 90 new_score: 95 change_value: 5 reason: 系统调整 created_at: 2025-11-09 06:21:06 *************************** 3. row *************************** id: 3 user_id: 1 old_score: 85 new_score: 90 change_value: 5 reason: 测试加分 created_at: 2025-11-09 05:55:47 *************************** 4. row *************************** id: 2 user_id: 1 old_score: 95 new_score: 85 change_value: -10 reason: 评论被举报 created_at: 2025-11-09 05:38:14 *************************** 5. row *************************** id: 1 user_id: 1 old_score: 80 new_score: 95 change_value: 15 reason: 完善个人资料 created_at: 2025-11-09 05:38:14 5 rows in set (0.00 sec) mysql> SELECT user_id, score, level, updated_at FROM ecs_user_credit; +---------+-------+-----------------+---------------------+ | user_id | score | level | updated_at | +---------+-------+-----------------+---------------------+ | 1 | 90 | ★★★★★ | 2025-11-09 06:22:39 | | 2 | 90 | ★★★★☆ | 2025-11-09 06:22:39 | +---------+-------+-----------------+---------------------+ 2 rows in set (0.00 sec) mysql> CALL sp_add_credit(3, 10, '邀请好友'); Query OK, 1 row affected (0.00 sec) mysql> DELIMITER ;; mysql> DROP PROCEDURE IF EXISTS sp_add_credit;; Query OK, 0 rows affected (0.00 sec) mysql> CREATE PROCEDURE sp_add_credit( -> IN p_user_id INT, -> IN p_change INT, -> IN p_reason VARCHAR(100) -> ) -> BEGIN -> DECLARE current_score INT DEFAULT 80; -> DECLARE CONTINUE HANDLER FOR NOT FOUND SET current_score = 80; -> -> -- 尝试获取当前分数(如果存在) -> SELECT score INTO current_score -> FROM ecs_user_credit -> WHERE user_id = p_user_id -> FOR UPDATE; -> -> -- 插入或更新:不存在则从 80+p_change 开始 -> INSERT INTO ecs_user_credit (user_id, score) -> VALUES (p_user_id, current_score + p_change) -> ON DUPLICATE KEY UPDATE score = score + p_change; -> END;; Query OK, 0 rows affected (0.00 sec) mysql> DELIMITER ; mysql> DELIMITER ;; mysql> DROP PROCEDURE IF EXISTS sp_add_credit;; Query OK, 0 rows affected (0.00 sec) mysql> CREATE PROCEDURE 
sp_add_credit( -> IN p_user_id INT, -> IN p_change INT, -> IN p_reason VARCHAR(100) -> ) -> BEGIN -> DECLARE current_score INT DEFAULT 80; -> DECLARE new_score INT; -> DECLARE CONTINUE HANDLER FOR NOT FOUND SET current_score = 80; -> -> -- 获取当前分数(加锁) -> SELECT score INTO current_score -> FROM ecs_user_credit -> WHERE user_id = p_user_id -> FOR UPDATE; -> -> -- 计算新分数 -> SET new_score = current_score + p_change; -> -> -- 限制范围 -> IF new_score < 0 THEN -> SET new_score = 0; -> ELSEIF new_score > 100 THEN -> SET new_score = 100; -> END IF; -> -> -- 写入数据库 -> INSERT INTO ecs_user_credit (user_id, score) -> VALUES (p_user_id, new_score) -> ON DUPLICATE KEY UPDATE score = new_score; -> END;; Query OK, 0 rows affected (0.01 sec) mysql> DELIMITER ; mysql> CREATE OR REPLACE VIEW vw_user_credit_summary AS -> SELECT -> u.user_id, -> u.score, -> u.level, -> u.updated_at, -> ( -> SELECT GROUP_CONCAT(reason SEPARATOR '; ') -> FROM ecs_user_credit_log l -> WHERE l.user_id = u.user_id -> ORDER BY l.created_at DESC -> LIMIT 3 -> ) AS recent_actions -> FROM ecs_user_credit u -> ORDER BY u.score DESC; Query OK, 0 rows affected (0.01 sec) mysql> SELECT * FROM vw_user_credit_summary\G *************************** 1. row *************************** user_id: 1 score: 90 level: ★★★★★ updated_at: 2025-11-09 06:22:39 recent_actions: 完善个人资料; 评论被举报; 测试加分; 系统调整; 系统调整 *************************** 2. row *************************** user_id: 2 score: 90 level: ★★★★☆ updated_at: 2025-11-09 06:22:39 recent_actions: NULL *************************** 3. 
row *************************** user_id: 3 score: 90 level: ★★★★☆ updated_at: 2025-11-09 06:24:47 recent_actions: NULL 3 rows in set (0.00 sec) mysql> -- 日志表按用户和时间查询频繁 mysql> ALTER TABLE ecs_user_credit_log ADD INDEX idx_user_time (user_id, created_at); Query OK, 0 rows affected (0.02 sec) Records: 0 Duplicates: 0 Warnings: 0 mysql> mysql> -- 主表按分数排序常用 mysql> ALTER TABLE ecs_user_credit ADD INDEX idx_score (score); Query OK, 0 rows affected (0.03 sec) Records: 0 Duplicates: 0 Warnings: 0 mysql>
最新发布
11-10
这个是我的源文件translate.py和配置文件config.yaml import os import re import json import yaml import logging import argparse from pathlib import Path from lxml import etree from transformers import MarianMTModel, MarianTokenizer, logging as hf_logging from typing import Dict, List, Tuple # -------------------------------------------------- # 日志 # -------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[ logging.FileHandler("translation.log", encoding="utf-8"), logging.StreamHandler() ] ) hf_logging.set_verbosity_error() XML_NS = "http://www.w3.org/XML/1998/namespace" # -------------------------------------------------- # 配置 & 术语表 # -------------------------------------------------- def load_config(config_path: str = "config.yaml") -> dict: try: with open(config_path, "r", encoding="utf-8") as f: cfg = yaml.safe_load(f) or {} defaults = { "source_language": "en", "target_languages": ["fr", "de", "es"], "input_dir": "xliff_in", "output_dir": "output", "qa_report_dir": "qa_reports", "model_template": "Helsinki-NLP/opus-mt-{src}-{tgt}", "generate_qa_only": False } for k, v in defaults.items(): cfg.setdefault(k, v) # 处理旧的键名兼容 if "qa_report" in cfg: cfg["qa_report_dir"] = cfg["qa_report"] if "target_languages" in cfg: cfg["target_language"] = cfg["target_languages"] elif "target_language" not in cfg: cfg["target_language"] = defaults["target_languages"] # 创建输出目录 Path(cfg["output_dir"]).mkdir(exist_ok=True) Path(cfg["qa_report_dir"]).mkdir(exist_ok=True) return cfg except FileNotFoundError: logging.error(f"配置文件 {config_path} 未找到!") raise except Exception as e: logging.error(f"加载配置失败: {e}") raise def load_glossary(glossary_path: str = "glossary.json") -> dict: try: if os.path.exists(glossary_path) and os.path.getsize(glossary_path) > 0: with open(glossary_path, "r", encoding="utf-8") as f: return json.load(f) or {} logging.warning(f"术语表 {glossary_path} 不存在或为空,将不使用术语表") return {} except 
Exception as e: logging.error(f"加载术语表失败({glossary_path}): {e}") return {} # -------------------------------------------------- # 模型管理 # -------------------------------------------------- class ModelManager: def __init__(self, model_template: str): self.model_cache = {} self.model_template = model_template self.supported_langs = {"en", "fr", "de", "es", "ar", "pt", "ja", "ru", "zh"} def is_supported(self, src_lang: str, tgt_lang: str) -> bool: return src_lang in self.supported_langs and tgt_lang in self.supported_langs def get_model(self, src_lang: str, tgt_lang: str): key = f"{src_lang}-{tgt_lang}" if key not in self.model_cache: model_name = self.model_template.format(src=src_lang, tgt=tgt_lang) logging.info(f"加载模型: {model_name}") tokenizer = MarianTokenizer.from_pretrained(model_name) model = MarianMTModel.from_pretrained(model_name) self.model_cache[key] = {"tokenizer": tokenizer, "model": model} logging.info(f"模型 {model_name} 加载成功") return self.model_cache[key] # -------------------------------------------------- # 内容保护函数 # -------------------------------------------------- def protect_content(text: str) -> Tuple[str, Dict[str, str]]: """保护HTML标签和短代码""" if not text: return text, {} protected = text tag_map = {} idx = 0 # 保护短代码 [shortcode] shortcodes = re.findall(r'(\[[^\]]+\])', protected) for sc in shortcodes: key = f"__SC{idx}__" tag_map[key] = sc protected = protected.replace(sc, key, 1) idx += 1 # 保护HTML标签 <tag> html_tags = re.findall(r'(<[^>]+>)', protected) for tag in html_tags: key = f"__HTML{idx}__" tag_map[key] = tag protected = protected.replace(tag, key, 1) idx += 1 return protected, tag_map def restore_content(text: str, tag_map: Dict[str, str]) -> str: """还原被保护的内容""" if not text or not tag_map: return text restored = text for key, original in tag_map.items(): restored = restored.replace(key, original) return restored # -------------------------------------------------- # 翻译 & 检查 # -------------------------------------------------- BRAND_NAMES = 
{"Flatsome", "WPML", "Helsinki-NLP", "MarianMT", "Petsva"} def should_translate(text: str, tgt_lang: str, glossary: dict) -> bool: s = (text or "").strip() if not s: return False if s in BRAND_NAMES: return False if re.fullmatch(r'^[\d\.,\s]+(?:USD|EUR|¥|\$|€)?$', s): return False # 检查WPML短代码 - 如果整个文本都是短代码,则不翻译 if re.fullmatch(r'\[wpml[^\]]*\]', s, re.IGNORECASE): return False if re.fullmatch(r'^SKU[-_]\d+$', s, re.IGNORECASE): return False if re.fullmatch(r'https?://.*|www\..*|/[\w\-/]+', s): return False if s in glossary and tgt_lang in glossary[s]: return False if re.search(r'(email|password|username|account|phone|tel|contact)', s, re.IGNORECASE): return False return True def translate_text(text: str, tgt_lang: str, model_manager: ModelManager, glossary: dict, src_lang: str = "en") -> str: s = (text or "").strip() # 1) 术语表优先 if s in glossary and tgt_lang in glossary[s]: return glossary[s][tgt_lang] # 2) 不需要翻译直接返回 if not should_translate(text, tgt_lang, glossary): return text or "" # 3) 语言支持 if not model_manager.is_supported(src_lang, tgt_lang): logging.warning(f"不支持语言对 {src_lang}->{tgt_lang},返回原文") return text or "" # 4) 保护内容(HTML标签 + 短代码) protected, tag_map = protect_content(text) try: model_pack = model_manager.get_model(src_lang, tgt_lang) tokenizer, model = model_pack["tokenizer"], model_pack["model"] # 5) 翻译 inputs = tokenizer(protected, return_tensors="pt", padding=True, truncation=True, max_length=512) out = model.generate(**inputs) translated = tokenizer.decode(out[0], skip_special_tokens=True) # 6) 还原保护内容 translated = restore_content(translated, tag_map) # 7) 修复标签完整性 translated = fix_tag_integrity(translated) # 8) 有效性检查 if not is_valid_translation(translated): logging.error(f"翻译结果无效,回退使用源文本") return fix_tag_integrity(text or "") return translated except Exception as e: logging.error(f"翻译失败: {s[:60]}... 
错误: {e}") return fix_tag_integrity(text or "") def fix_tag_integrity(txt: str) -> str: """修复标签完整性""" if not txt: return txt # 简单的标签平衡检查 open_tags = re.findall(r'<([a-zA-Z0-9]+)[^>]*>', txt) close_tags = re.findall(r'</([a-zA-Z0-9]+)>', txt) # 计算需要补全的标签 for tag in close_tags: if tag in open_tags: open_tags.remove(tag) # 补全缺失的闭合标签 result = txt for tag in reversed(open_tags): result += f'</{tag}>' logging.warning(f"补全未闭合标签:</{tag}>") return result def is_valid_translation(txt: str) -> bool: s = (txt or "").strip() if len(s) < 2: return False # 特殊字符占比 specials = len(re.findall(r'[*\-=_+#@]', s)) if specials and specials / max(1, len(s)) > 0.3: return False # 不全是重复符号 if re.fullmatch(r'([*\-=])\1{6,}', s): return False return True def check_consistency(source: str, target: str) -> list: issues = [] if not (source or "").strip(): return issues # 数字/金额一致性 - 更精确的提取 def extract_important_numbers(text: str) -> List[str]: """只提取重要的数字(货币等),避免日期、页码等""" patterns = [ r'\$\d+\.?\d*', # 美元金额 r'€\d+\.?\d*', # 欧元金额 r'¥\d+\.?\d*', # 日元金额 r'£\d+\.?\d*', # 英镑金额 r'\b\d+\.\d{2}\b', # 金额格式的小数 ] numbers = [] for pattern in patterns: numbers.extend(re.findall(pattern, text)) return numbers s_num = extract_important_numbers(source or "") t_num = extract_important_numbers(target or "") if s_num != t_num: issues.append(f"数字或金额不一致:源[{','.join(s_num)}] → 目标[{','.join(t_num)}]") # {} 占位符 def extract_placeholders(text: str) -> list: return re.findall(r'\{[^}]+\}', text or "") sp = extract_placeholders(source or "") tp = extract_placeholders(target or "") if len(sp) != len(tp): issues.append(f"占位符数量不一致:源{len(sp)}个 → 目标{len(tp)}个") else: semantic_maps = { "physical address": ["adresse physique", "physische adresse", "dirección física"], "email": ["email", "e-mail", "correo electrónico"], "phone": ["téléphone", "telefon", "teléfono"], "name": ["nom", "name", "nombre"], "order id": ["numéro de commande", "bestellnummer", "número de pedido"] } for s_ph, t_ph in zip(sp, tp): if s_ph == t_ph: continue 
s_core = s_ph.strip("{}").lower() t_core = t_ph.strip("{}").lower() ok = False if s_core in semantic_maps: ok = t_core in semantic_maps[s_core] else: cores = [w for w in s_core.split() if len(w) > 2] ok = any(c in t_core for c in cores) if not ok: issues.append(f"占位符内容不匹配:源{s_ph} → 目标{t_ph}") # 短代码一致性检查 def extract_shortcodes(text: str) -> list: return re.findall(r'\[[^\]]+\]', text or "") source_shortcodes = extract_shortcodes(source or "") target_shortcodes = extract_shortcodes(target or "") if len(source_shortcodes) != len(target_shortcodes): issues.append(f"短代码数量不一致:源{len(source_shortcodes)}个 → 目标{len(target_shortcodes)}个") else: for i, (src_sc, tgt_sc) in enumerate(zip(source_shortcodes, target_shortcodes)): if src_sc != tgt_sc: issues.append(f"短代码内容不匹配:位置{i+1}, 源={src_sc}, 目标={tgt_sc}") # 标签一致性 def extract_tags(text: str) -> list: tags = [t for t in re.findall(r"<[^>]+>", text or "") if t.strip()] return [t for t in tags if not re.search(r'img', t, re.I)] st = extract_tags(source or "") tt = extract_tags(target or "") if len(st) != len(tt): issues.append(f"标签数量不一致:源{len(st)}个 → 目标{len(tt)}个") # 目标有效性 t_stripped = (target or "").strip() if not t_stripped or re.fullmatch(r'([*\-=\_\+\#\@\s])+', t_stripped) or len(t_stripped) < max(1, len((source or "").strip())) * 0.3: issues.append("目标文本无效:全为特殊字符/空白/过度截断,无有效内容") return issues # -------------------------------------------------- # XLIFF 处理 # -------------------------------------------------- def ensure_target_element(trans_unit, ns, tgt_lang: str, default_state: str = "translated"): target = trans_unit.find(f"{{{ns}}}target") if target is None: target = etree.SubElement(trans_unit, "target") target.set(f"{{{XML_NS}}}lang", tgt_lang) if "state" not in target.attrib: target.set("state", default_state) return target def build_qa_root_like(original_root): qa_root = etree.Element(original_root.tag, nsmap=original_root.nsmap) for k, v in original_root.attrib.items(): qa_root.set(k, v) return qa_root def 
process_xliff(file_path: Path, model_manager: ModelManager, glossary: dict, config: dict): try: logging.info(f"开始处理文件: {file_path.name}") parser = etree.XMLParser(remove_blank_text=True) tree = etree.parse(str(file_path), parser) root = tree.getroot() ns = root.nsmap.get(None, "urn:oasis:names:tc:xliff:document:1.2") tool_ns = root.nsmap.get("tool", "https://cdn.wpml.org/xliff/custom-attributes.xsd") file_elem = root.find(f".//{{{ns}}}file") if file_elem is None: logging.warning(f"文件 {file_path.name} 无 <file> 标签,跳过") return src_lang = file_elem.get("source-language") or config["source_language"] tgt_lang = file_elem.get("target-language") or ( config["target_language"][0] if isinstance(config["target_language"], list) else config["target_language"] ) if src_lang == tgt_lang: logging.info(f"源语言与目标语言相同({src_lang}),跳过文件 {file_path.name}") return qa_root = build_qa_root_like(root) qa_file = etree.SubElement(qa_root, "file", attrib=file_elem.attrib) qa_body = etree.SubElement(qa_file, "body") trans_units = file_elem.findall(f".//{{{ns}}}body/{{{ns}}}trans-unit") total = len(trans_units) if total == 0: logging.warning(f"文件 {file_path.name} 无翻译单元,跳过") return logging.info(f"找到 {total} 个翻译单元") write_translated = not bool(config.get("generate_qa_only", False)) for i, tu in enumerate(trans_units, 1): if i % 10 == 0 or i == total: logging.info(f"进度 {i}/{total}") source_el = tu.find(f"{{{ns}}}source") if source_el is None: continue src_text = source_el.text or "" translated_text = translate_text(src_text, tgt_lang, model_manager, glossary, src_lang) issues = check_consistency(src_text, translated_text) if write_translated: target_el = ensure_target_element(tu, ns, tgt_lang, default_state="translated") target_el.text = translated_text if issues: tu_id = tu.get("id") or f"unit-{i}" qa_tu = etree.SubElement(qa_body, "trans-unit", id=tu_id) extradata = tu.find(f"{{{tool_ns}}}extradata") if extradata is not None: qa_ex = etree.SubElement(qa_tu, f"{{{tool_ns}}}extradata", 
attrib=extradata.attrib) qa_ex.text = extradata.text etree.SubElement(qa_tu, "source").text = src_text t = etree.SubElement(qa_tu, "target") t.set(f"{{{XML_NS}}}lang", tgt_lang) t.set("state", "needs-review-translation") t.text = translated_text etree.SubElement(qa_tu, "note").text = "; ".join(issues) out_dir = Path(config["output_dir"]) out_dir.mkdir(exist_ok=True) if write_translated: out_file = out_dir / f"{file_path.stem}_translated.xliff" tree.write(str(out_file), encoding="utf-8", xml_declaration=True, pretty_print=True) logging.info(f"已保存翻译文件: {out_file}") has_qa = len(qa_body) > 0 if has_qa: qa_dir = Path(config["qa_report_dir"]) qa_dir.mkdir(exist_ok=True) qa_file_out = qa_dir / f"{file_path.stem}_qa.xliff" # 修复:保存QA根结构 qa_tree = etree.ElementTree(qa_root) qa_tree.write(str(qa_file_out), encoding="utf-8", xml_declaration=True, pretty_print=True) logging.info(f"QA 文件已保存至 {qa_file_out}") except Exception as e: logging.error(f"处理文件 {file_path.name} 出错: {e}") def main(): parser = argparse.ArgumentParser(description="处理 XLIFF 文件的翻译") parser.add_argument("--config", default="config.yaml", help="配置文件路径") args = parser.parse_args() config = load_config(args.config) glossary = load_glossary() model_manager = ModelManager(config["model_template"]) input_dir = Path(config["input_dir"]) for xliff_file in input_dir.glob("*.xliff"): process_xliff(xliff_file, model_manager, glossary, config) if __name__ == "__main__": main()# 源语言 source_language: en # 目标语言列表(随时可以扩展) target_languages: - fr - de - es # 后期可新增:ar, pt, ja, ru, zh, ... # 输入输出目录 input_dir: "xliff_in" output_dir: "output" qa_report: qa_reports generate_qa_only: true # 模型选择模板 # {src} 和 {tgt} 会自动替换为源语言和目标语言 model_template: "Helsinki-NLP/opus-mt-{src}-{tgt}" # cmd 里面输入.venv\Scripts\activate.bat 激活环境
09-01
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值