CodeReview(2) - Regular Expression Pattern Objects Should Be Precompiled

This post walks through a performance problem in a common mobile-number validation method and its fix: precompiling the regular expression avoids recompiling it on every call, which significantly speeds up validation.

Code location:

/waf-base/src/main/java/com/yxt/common/util/CommonUtil.java

Method involved:

Line 556: isMobileNo()

Problem: every call to isMobileNo() compiles the regular expression all over again.

public static boolean isMobileNo(String mobile) {
	if (StringUtils.isNotBlank(mobile)) {
		// Recompiled on every invocation; this is the hot spot.
		Pattern p = Pattern.compile("1\\d{10}");
		Matcher m = p.matcher(mobile);
		return m.matches();
	} else {
		return false;
	}
}

Solution: compile the pattern once, as a static field, and reuse it.

private static final Pattern MOBILE_PATTERN = Pattern.compile("1\\d{10}");

public static boolean isMobileNo(String mobile) {
	if (StringUtils.isNotBlank(mobile)) {
		// Pattern is immutable and thread-safe, so one shared,
		// precompiled instance can serve all callers.
		Matcher m = MOBILE_PATTERN.matcher(mobile);
		return m.matches();
	} else {
		return false;
	}
}
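
To make the saved work visible, here is a minimal, self-contained timing sketch. It is illustrative only: the class name, sample value, and iteration count are arbitrary choices of mine, and a tool like JMH would be the right way to measure this rigorously.

import java.util.regex.Pattern;

// Rough timing sketch, not a rigorous benchmark (use JMH for that).
public class PatternCompileCost {

	private static final Pattern MOBILE_PATTERN = Pattern.compile("1\\d{10}");

	static boolean compileEveryCall(String mobile) {
		return Pattern.compile("1\\d{10}").matcher(mobile).matches();
	}

	static boolean precompiled(String mobile) {
		return MOBILE_PATTERN.matcher(mobile).matches();
	}

	public static void main(String[] args) {
		String sample = "13800138000";
		int n = 1_000_000;
		int hits = 0; // accumulate results so the JIT cannot drop the calls

		long t0 = System.nanoTime();
		for (int i = 0; i < n; i++) hits += compileEveryCall(sample) ? 1 : 0;
		long t1 = System.nanoTime();
		for (int i = 0; i < n; i++) hits += precompiled(sample) ? 1 : 0;
		long t2 = System.nanoTime();

		System.out.printf("compile per call: %d ms%n", (t1 - t0) / 1_000_000);
		System.out.printf("precompiled:      %d ms%n", (t2 - t1) / 1_000_000);
		System.out.println("matches counted: " + hits);
	}
}

Pattern.compile has to parse the expression and build its internal match program on every call, while MOBILE_PATTERN.matcher() only allocates a lightweight Matcher over the already-compiled program, so the precompiled variant is typically several times faster on short inputs. Sharing one Pattern across threads is safe because Pattern instances are immutable; Matcher instances are not thread-safe, which is why a fresh Matcher is still created per call.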

 
