My translate.py has a lot of problems right now. How do I fix them and guard against the latent ones? What should the directory structure look like? How can I tighten up my logic, and what pitfalls have developers building the same thing run into?

import os
import re
import json
import yaml
import logging
import argparse
from pathlib import Path
from lxml import etree
from transformers import MarianMTModel, MarianTokenizer, logging as hf_logging
from typing import Dict, List, Tuple
# --------------------------------------------------
# Logging configuration
# --------------------------------------------------
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler("translation.log", encoding="utf-8"),
logging.StreamHandler()
]
)
hf_logging.set_verbosity_error()
XML_NS = "http://www.w3.org/XML/1998/namespace"
# --------------------------------------------------
# Config and glossary loading
# --------------------------------------------------
def load_config(config_path: str = "config.yaml") -> dict:
try:
with open(config_path, "r", encoding="utf-8") as f:
cfg = yaml.safe_load(f) or {}
defaults = {
"source_language": "en",
"target_languages": ["fr", "de", "es"],
"input_dir": "xliff_in",
"output_dir": "output",
"qa_report_dir": "qa_reports",
"model_template": "Helsinki-NLP/opus-mt-{src}-{tgt}",
"generate_qa_only": False
}
for k, v in defaults.items():
cfg.setdefault(k, v)
        # Backward compatibility with legacy config key names
        if "qa_report" in cfg:
            cfg["qa_report_dir"] = cfg["qa_report"]
        # setdefault above guarantees "target_languages" exists, so simply
        # mirror it into the legacy singular key
        cfg["target_language"] = cfg["target_languages"]
        # Create output directories (parents=True so nested paths also work)
        Path(cfg["output_dir"]).mkdir(parents=True, exist_ok=True)
        Path(cfg["qa_report_dir"]).mkdir(parents=True, exist_ok=True)
return cfg
except FileNotFoundError:
logging.error(f"配置文件 {config_path} 未找到!")
raise
except Exception as e:
logging.error(f"加载配置失败: {e}")
raise
def load_glossary(glossary_path: str = "glossary.json") -> dict:
try:
if os.path.exists(glossary_path) and os.path.getsize(glossary_path) > 0:
with open(glossary_path, "r", encoding="utf-8") as f:
return json.load(f) or {}
logging.warning(f"术语表 {glossary_path} 不存在或为空,将不使用术语表")
return {}
except Exception as e:
logging.error(f"加载术语表失败({glossary_path}): {e}")
return {}
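# The JSON shape load_glossary() expects, inferred from how apply_glossary()
# below indexes it as glossary[term][tgt_lang]; the terms here are hypothetical:
#
#   {
#     "Add to cart": {"fr": "Ajouter au panier", "de": "In den Warenkorb"},
#     "Checkout": {"fr": "Commander", "de": "Zur Kasse"}
#   }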
# --------------------------------------------------
# Model management
# --------------------------------------------------
class ModelManager:
def __init__(self, model_template: str):
self.model_cache = {}
self.model_template = model_template
self.supported_langs = {"en", "fr", "de", "es", "ar", "pt", "ja", "ru", "zh"}
def is_supported(self, src_lang: str, tgt_lang: str) -> bool:
return src_lang in self.supported_langs and tgt_lang in self.supported_langs
def get_model(self, src_lang: str, tgt_lang: str):
key = f"{src_lang}-{tgt_lang}"
if key not in self.model_cache:
model_name = self.model_template.format(src=src_lang, tgt=tgt_lang)
logging.info(f"加载模型: {model_name}")
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
self.model_cache[key] = {"tokenizer": tokenizer, "model": model}
logging.info(f"模型 {model_name} 加载成功")
return self.model_cache[key]
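# A pitfall others have hit with opus-mt: not every pair in supported_langs is
# actually published on the Hub, and from_pretrained() downloads on first use,
# so a missing pair or a network failure surfaces as an OSError right here.
# A hedged guard for the call site, sketch only:
#
#   try:
#       pack = model_manager.get_model(src_lang, tgt_lang)
#   except OSError as exc:
#       logging.error(f"No model for {src_lang}->{tgt_lang}: {exc}")
#       pack = None  # caller falls back to returning the source text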
# --------------------------------------------------
# Core fix: content protection (tamper-resistant placeholders, broad matching)
# --------------------------------------------------
def generate_robust_placeholder(idx: int) -> str:
"""生成模型无法识别和修改的防篡改占位符"""
return f"[[XLFF_PROTECT_{idx}_SAFE]]"
def protect_content(text: str) -> Tuple[str, Dict[str, str]]:
"""
保护HTML标签(含属性)和短代码,解决:
1. 占位符被模型篡改问题
2. 标签属性被翻译问题
3. 短代码识别不全问题
"""
if not text:
return text, {}
protected = text
tag_map = {}
idx = 0
    # 1. Protect HTML tags with all their attributes, so the model cannot rewrite style= etc.
html_pattern = r'(<\/?[a-zA-Z0-9]+(?:\s+[a-zA-Z0-9\-]+(?:="[^"]*")?)*\s*\/?>)'
html_matches = list(re.finditer(html_pattern, protected, re.IGNORECASE))
    # Replace in reverse order so earlier match offsets stay valid
for match in reversed(html_matches):
original_tag = match.group(1)
placeholder = generate_robust_placeholder(idx)
protected = protected[:match.start()] + placeholder + protected[match.end():]
tag_map[placeholder] = original_tag
idx += 1
    # 2. Protect shortcodes (attributes supported). The negative lookahead stops the
    #    pattern from re-matching the inner bracket run of our own placeholders, and
    #    surrounding whitespace is left alone so spacing survives the round trip
    shortcode_pattern = r'(\[(?!XLFF_PROTECT_)[a-zA-Z0-9_\-]+\s*(?:[a-zA-Z0-9_\-]+="[^"]*"\s*)*\/?\])'
    shortcode_matches = list(re.finditer(shortcode_pattern, protected, re.IGNORECASE))
    for match in reversed(shortcode_matches):
        original_sc = match.group(1)  # keep the shortcode exactly as written
        placeholder = generate_robust_placeholder(idx)
        protected = protected[:match.start()] + placeholder + protected[match.end():]
        tag_map[placeholder] = original_sc
        idx += 1
return protected, tag_map
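# Round-trip sketch with a hypothetical string. Matches are replaced in reverse
# order, so the *last* tag in the text receives index 0:
#
#   protected, tag_map = protect_content('Click <a href="/shop">here</a>')
#   # protected == 'Click [[XLFF_PROTECT_1_SAFE]]here[[XLFF_PROTECT_0_SAFE]]'
#   # tag_map   == {'[[XLFF_PROTECT_0_SAFE]]': '</a>',
#   #               '[[XLFF_PROTECT_1_SAFE]]': '<a href="/shop">'}
#   restored, ok = restore_content(protected, tag_map)
#   # restored == 'Click <a href="/shop">here</a>', ok == True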
def restore_content(text: str, tag_map: Dict[str, str]) -> Tuple[str, bool]:
"""严格还原占位符,处理模型可能的篡改"""
if not text or not tag_map:
return text, True
restored = text
remaining_placeholders = []
    # Replace longest placeholders first so a short one never matches inside a longer one
sorted_placeholders = sorted(tag_map.keys(), key=lambda x: len(x), reverse=True)
for placeholder in sorted_placeholders:
original = tag_map[placeholder]
if placeholder in restored:
restored = restored.replace(placeholder, original)
else:
            # Look for tampered placeholders (e.g. a dropped bracket or prefix)
tampered_pattern = re.compile(re.escape(placeholder).replace(r'\[', r'\[?').replace(r'\]', r'\]?'))
if tampered_pattern.search(restored):
restored = tampered_pattern.sub(original, restored)
logging.warning(f"修复被篡改的占位符: {placeholder}")
else:
remaining_placeholders.append(placeholder)
    # Handle placeholders that never came back (force-append the originals)
    if remaining_placeholders:
        logging.error(f"Unrestored placeholders: {remaining_placeholders}")
        for ph in remaining_placeholders:
            restored += f" {tag_map[ph]}"  # append the original content so structure is not lost
return restored, False
return restored, True
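# Illustration of the repair above: if the model emits '[XLFF_PROTECT_0_SAFE]]'
# (one bracket lost), the relaxed pattern r'\[?\[?XLFF_PROTECT_0_SAFE\]?\]?'
# still matches, and the original tag is substituted back in.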
# --------------------------------------------------
# Translation and QA logic
# --------------------------------------------------
BRAND_NAMES = {"Flatsome", "WPML", "Helsinki-NLP", "MarianMT", "Petsva"}
def should_translate(text: str, tgt_lang: str, glossary: dict) -> bool:
s = (text or "").strip()
if not s:
return False
if s in BRAND_NAMES:
return False
if re.fullmatch(r'^[\d\.,\s]+(?:USD|EUR|¥|\$|€)?$', s):
return False
if re.fullmatch(r'\[wpml[^\]]*\]', s, re.IGNORECASE):
return False
if re.fullmatch(r'^SKU[-_]\d+$', s, re.IGNORECASE):
return False
if re.fullmatch(r'https?://.*|www\..*|/[\w\-/]+', s):
return False
if s in glossary and tgt_lang in glossary[s]:
return False
if re.search(r'(email|password|username|account|phone|tel|contact)', s, re.IGNORECASE):
return False
return True
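# What the filter above skips, for illustration:
#   should_translate("SKU-1024", "fr", {})            -> False  (SKU pattern)
#   should_translate("https://example.com", "fr", {}) -> False  (URL)
#   should_translate("Flatsome", "fr", {})            -> False  (brand name)
#   should_translate("19.99 USD", "fr", {})           -> False  (price/amount)
#   should_translate("Add to cart", "fr", {})         -> True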
def apply_glossary(text: str, tgt_lang: str, glossary: dict) -> Tuple[str, bool]:
"""应用术语表,支持多词术语和模糊匹配"""
if not text or not glossary:
return text, False
processed_text = text
hit_glossary = False
sorted_terms = sorted(glossary.items(), key=lambda x: len(x[0]), reverse=True)
for src_term, tgt_dict in sorted_terms:
if tgt_lang not in tgt_dict:
continue
tgt_term = tgt_dict[tgt_lang]
pattern = re.compile(re.escape(src_term), re.IGNORECASE)
processed_text = pattern.sub(tgt_term, processed_text)
if pattern.search(text):
hit_glossary = True
return processed_text, hit_glossary
def filter_invalid_input(text: str) -> Tuple[str, bool]:
"""过滤无意义输入,避免无效翻译"""
s = text.strip()
if len(s) < 2:
return text, False
if re.fullmatch(r'[\s\*\-=_+#@~`!%^&()\[\]{}|\\;:\'",./<>?]+', s):
return text, False
if re.fullmatch(r'^\d+$', s):
return text, False
return text, True
def translate_text(text: str, tgt_lang: str, model_manager: ModelManager, glossary: dict, src_lang: str = "en") -> str:
s = (text or "").strip()
    original_text = text  # keep the untouched source for fallback
    # 1. Filter out meaningless input
    filtered_text, is_valid = filter_invalid_input(s)
    if not is_valid:
        logging.debug(f"Meaningless input, skipping translation: {s[:30]}...")
        return original_text
    # 2. Apply the glossary. Note the short-circuit: a hit returns the substituted
    #    text without running the model, so a partial match yields mixed-language output
    term_processed_text, hit_glossary = apply_glossary(filtered_text, tgt_lang, glossary)
    if hit_glossary:
        logging.debug(f"Glossary hit, returning directly: {term_processed_text[:30]}...")
        return term_processed_text
    # 3. Skip strings that should not be machine-translated, then check language-pair support
    if not should_translate(s, tgt_lang, glossary):
        logging.debug(f"Non-translatable string, returning source: {s[:30]}...")
        return original_text
    if not model_manager.is_supported(src_lang, tgt_lang):
        logging.warning(f"Unsupported language pair {src_lang}->{tgt_lang}, returning source")
        return original_text
    # 4. Content protection
protected_text, tag_map = protect_content(term_processed_text)
try:
        # 5. Model translation
model_pack = model_manager.get_model(src_lang, tgt_lang)
tokenizer, model = model_pack["tokenizer"], model_pack["model"]
inputs = tokenizer(
protected_text,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512,
add_special_tokens=True
)
out = model.generate(
**inputs,
max_length=1024,
num_beams=4,
early_stopping=True,
            no_repeat_ngram_size=2  # discourage repeated n-grams in the output
)
translated_protected = tokenizer.decode(out[0], skip_special_tokens=True)
        # 6. Restore protected content
        translated_text, is_restore_complete = restore_content(translated_protected, tag_map)
        if not is_restore_complete:
            logging.warning(f"Restore incomplete, repair attempted: {translated_text[:30]}...")
        # 7. Tag-integrity repair plus a final sanity check, falling back to the source
        translated_text = fix_tag_integrity(translated_text)
        if not is_valid_translation(translated_text, original_text):
            logging.error(f"Invalid translation, falling back to source: {translated_text[:30]}...")
            return fix_tag_integrity(original_text)
return translated_text
except Exception as e:
logging.error(f"翻译失败,回退原文:{s[:30]}... 错误:{str(e)[:50]}")
return fix_tag_integrity(original_text)
def fix_tag_integrity(txt: str) -> str:
"""修复未闭合的HTML标签"""
if not txt:
return txt
open_tags = []
# 提取所有开放标签(排除自闭合标签)
for match in re.finditer(r'<([a-zA-Z0-9]+)[^>]*>', txt):
tag = match.group(1).lower()
if not re.search(r'\/\s*>$', match.group(0)): # 非自闭合标签
open_tags.append(tag)
    # Collect closing tags
    close_tags = [match.group(1).lower() for match in re.finditer(r'</([a-zA-Z0-9]+)>', txt)]
    # Append any missing closing tags (count-based, not nesting-aware)
result = txt
for tag in reversed(open_tags):
if tag in close_tags:
close_tags.remove(tag)
else:
result += f"</{tag}>"
logging.warning(f"补全未闭合标签:</{tag}>")
return result
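# Illustration of the repair above (it balances tag counts, not nesting order):
#   fix_tag_integrity('<b>Sale <i>today')  ->  '<b>Sale <i>today</i></b>'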
def is_valid_translation(translated: str, original: str) -> bool:
"""校验翻译有效性"""
translated_stripped = translated.strip()
original_stripped = original.strip()
    # Length check: reject outputs shorter than 30% of the source
if len(translated_stripped) < max(1, len(original_stripped) * 0.3):
return False
    # Leftover-placeholder check
if re.search(r'\[\[XLFF_PROTECT_\d+_SAFE\]\]', translated_stripped):
return False
    # Tag-count check (±1 tolerance)
original_tag_count = len(re.findall(r'<[^>]+>', original_stripped))
translated_tag_count = len(re.findall(r'<[^>]+>', translated_stripped))
if abs(original_tag_count - translated_tag_count) > 1:
return False
return True
def check_consistency(source: str, target: str, tgt_lang: str) -> list:
issues = []
source_stripped = (source or "").strip()
target_stripped = (target or "").strip()
if not source_stripped:
return issues
    # 1. Tag check
def extract_elements(text: str, pattern: str) -> list:
return re.findall(pattern, text or "", re.IGNORECASE)
tag_pattern = r'(<\/?[a-zA-Z0-9]+(?:\s+[a-zA-Z0-9\-]+(?:="[^"]*")?)*\s*\/?>)'
source_tags = extract_elements(source, tag_pattern)
target_tags = extract_elements(target, tag_pattern)
if len(source_tags) != len(target_tags):
issues.append(f"标签数量不一致:源[{len(source_tags)}] → 目标[{len(target_tags)}]")
else:
for i, (src_tag, tgt_tag) in enumerate(zip(source_tags, target_tags)):
if src_tag.lower() != tgt_tag.lower():
issues.append(f"标签不匹配(位置{i+1}):源[{src_tag}] → 目标[{tgt_tag}]")
# 2. 短代码校验
sc_pattern = r'(\[[a-zA-Z0-9\_\-]+\s*(?:[a-zA-Z0-9\_\-]+\="[^"]*"\s*)*\/?\])'
source_sc = extract_elements(source, sc_pattern)
target_sc = extract_elements(target, sc_pattern)
if len(source_sc) != len(target_sc):
issues.append(f"短代码数量不一致:源[{len(source_sc)}] → 目标[{len(target_sc)}]")
else:
for i, (src_sc, tgt_sc) in enumerate(zip(source_sc, target_sc)):
if src_sc != tgt_sc:
issues.append(f"短代码不匹配(位置{i+1}):源[{src_sc}] → 目标[{tgt_sc}]")
# 3. 占位符校验
placeholder_pattern = r'\{[^}]+\}'
source_ph = extract_elements(source, placeholder_pattern)
target_ph = extract_elements(target, placeholder_pattern)
if len(source_ph) != len(target_ph):
issues.append(f"占位符数量不一致:源[{len(source_ph)}] → 目标[{len(target_ph)}]")
# 4. 数字/金额一致性
def extract_numbers(text: str) -> List[str]:
patterns = [r'\$\d+\.?\d*', r'€\d+\.?\d*', r'¥\d+\.?\d*', r'£\d+\.?\d*', r'\b\d+\.\d{2}\b']
numbers = []
for p in patterns:
numbers.extend(re.findall(p, text))
return numbers
s_num = extract_numbers(source)
t_num = extract_numbers(target)
if s_num != t_num:
issues.append(f"数字不一致:源[{','.join(s_num)}] → 目标[{','.join(t_num)}]")
return issues
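# Example of what the checker flags, with a hypothetical pair:
#   check_consistency('<b>Hello</b> [gallery id="1"]', 'Bonjour', 'fr')
#   -> ['Tag count mismatch: source[2] → target[0]',
#       'Shortcode count mismatch: source[1] → target[0]']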
# --------------------------------------------------
# XLIFF processing
# --------------------------------------------------
def ensure_target_element(trans_unit, ns, tgt_lang: str, default_state: str = "translated"):
target = trans_unit.find(f"{{{ns}}}target")
    if target is None:
        # Create in the XLIFF namespace; a bare "target" would serialize with xmlns=""
        target = etree.SubElement(trans_unit, f"{{{ns}}}target")
    target.set(f"{{{XML_NS}}}lang", tgt_lang)
if "state" not in target.attrib:
target.set("state", default_state)
return target
def build_qa_root_like(original_root):
qa_root = etree.Element(original_root.tag, nsmap=original_root.nsmap)
for k, v in original_root.attrib.items():
qa_root.set(k, v)
return qa_root
def process_xliff(file_path: Path, model_manager: ModelManager, glossary: dict, config: dict):
try:
logging.info(f"开始处理文件: {file_path.name}")
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(str(file_path), parser)
root = tree.getroot()
ns = root.nsmap.get(None, "urn:oasis:names:tc:xliff:document:1.2")
tool_ns = root.nsmap.get("tool", "https://cdn.wpml.org/xliff/custom-attributes.xsd")
file_elem = root.find(f".//{{{ns}}}file")
if file_elem is None:
logging.warning(f"文件 {file_path.name} 无 <file> 标签,跳过")
return
src_lang = file_elem.get("source-language") or config["source_language"]
tgt_lang = file_elem.get("target-language") or (
config["target_language"][0] if isinstance(config["target_language"], list) else config["target_language"]
)
if src_lang == tgt_lang:
logging.info(f"源语言与目标语言相同({src_lang}),跳过文件")
return
qa_root = build_qa_root_like(root)
        qa_file = etree.SubElement(qa_root, f"{{{ns}}}file", attrib=file_elem.attrib)
        qa_body = etree.SubElement(qa_file, f"{{{ns}}}body")
trans_units = file_elem.findall(f".//{{{ns}}}body/{{{ns}}}trans-unit")
total = len(trans_units)
if total == 0:
logging.warning(f"文件 {file_path.name} 无翻译单元,跳过")
return
logging.info(f"找到 {total} 个翻译单元")
write_translated = not bool(config.get("generate_qa_only", False))
for i, tu in enumerate(trans_units, 1):
if i % 10 == 0 or i == total:
logging.info(f"进度 {i}/{total}")
source_el = tu.find(f"{{{ns}}}source")
if source_el is None:
continue
            src_text = source_el.text or ""  # note: inline child elements of <source> are ignored
translated_text = translate_text(src_text, tgt_lang, model_manager, glossary, src_lang)
issues = check_consistency(src_text, translated_text, tgt_lang)
if write_translated:
target_el = ensure_target_element(tu, ns, tgt_lang)
target_el.text = translated_text
if issues:
tu_id = tu.get("id") or f"unit-{i}"
                # Create QA elements in the XLIFF namespace so lxml does not emit xmlns=""
                qa_tu = etree.SubElement(qa_body, f"{{{ns}}}trans-unit", id=tu_id)
                extradata = tu.find(f"{{{tool_ns}}}extradata")
                if extradata is not None:
                    qa_ex = etree.SubElement(qa_tu, f"{{{tool_ns}}}extradata", attrib=extradata.attrib)
                    qa_ex.text = extradata.text
                etree.SubElement(qa_tu, f"{{{ns}}}source").text = src_text
                t = etree.SubElement(qa_tu, f"{{{ns}}}target")
                t.set(f"{{{XML_NS}}}lang", tgt_lang)
                t.set("state", "needs-review-translation")
                t.text = translated_text
                etree.SubElement(qa_tu, f"{{{ns}}}note").text = "; ".join(issues)
if write_translated:
out_file = Path(config["output_dir"]) / f"{file_path.stem}_translated.xliff"
tree.write(str(out_file), encoding="utf-8", xml_declaration=True, pretty_print=True)
logging.info(f"已保存翻译文件: {out_file}")
if len(qa_body) > 0:
qa_file_out = Path(config["qa_report_dir"]) / f"{file_path.stem}_qa.xliff"
qa_tree = etree.ElementTree(qa_root)
qa_tree.write(str(qa_file_out), encoding="utf-8", xml_declaration=True, pretty_print=True)
logging.info(f"QA 文件已保存至 {qa_file_out}")
except Exception as e:
logging.error(f"处理文件 {file_path.name} 出错: {e}", exc_info=True)
def main():
    parser = argparse.ArgumentParser(description="XLIFF translation tool (fixed version)")
    parser.add_argument("--config", default="config.yaml", help="Path to the config file")
args = parser.parse_args()
config = load_config(args.config)
glossary = load_glossary()
model_manager = ModelManager(config["model_template"])
    input_dir = Path(config["input_dir"])
    if not input_dir.is_dir():
        logging.error(f"Input directory {input_dir} does not exist")
        return
    # WPML exports may use .xlf as well as .xliff
    for xliff_file in sorted(list(input_dir.glob("*.xliff")) + list(input_dir.glob("*.xlf"))):
        process_xliff(xliff_file, model_manager, glossary, config)
if __name__ == "__main__":
main()
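On the directory-structure question: read straight off the defaults in load_config, the script assumes a layout like this (glossary.json is optional; the script warns and continues without it):

project/
├── translate.py
├── config.yaml
├── glossary.json      # optional term base
├── xliff_in/          # input_dir: source .xliff files
├── output/            # output_dir: *_translated.xliff
├── qa_reports/        # qa_report_dir: *_qa.xliff
└── translation.log    # created by the FileHandler on first run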
Below is my config.yaml.
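A minimal version consistent with the defaults in load_config would look like this (every value here is illustrative, not the author's actual file):

source_language: en
target_languages: [fr, de, es]
input_dir: xliff_in
output_dir: output
qa_report_dir: qa_reports
model_template: "Helsinki-NLP/opus-mt-{src}-{tgt}"
generate_qa_only: false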