Here are my source file translate.py and my config file config.yaml:
import os
import re
import json
import yaml
import logging
import argparse
from pathlib import Path
from lxml import etree
from transformers import MarianMTModel, MarianTokenizer, logging as hf_logging
from typing import Dict, List, Tuple
# --------------------------------------------------
# Logging
# --------------------------------------------------
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler("translation.log", encoding="utf-8"),
logging.StreamHandler()
]
)
hf_logging.set_verbosity_error()
XML_NS = "http://www.w3.org/XML/1998/namespace"
# --------------------------------------------------
# Config & glossary
# --------------------------------------------------
def load_config(config_path: str = "config.yaml") -> dict:
try:
with open(config_path, "r", encoding="utf-8") as f:
cfg = yaml.safe_load(f) or {}
defaults = {
"source_language": "en",
"target_languages": ["fr", "de", "es"],
"input_dir": "xliff_in",
"output_dir": "output",
"qa_report_dir": "qa_reports",
"model_template": "Helsinki-NLP/opus-mt-{src}-{tgt}",
"generate_qa_only": False
}
for k, v in defaults.items():
cfg.setdefault(k, v)
        # Backward compatibility with legacy key names
        if "qa_report" in cfg:
            cfg["qa_report_dir"] = cfg["qa_report"]
        # The defaults guarantee "target_languages" exists; mirror it onto the
        # singular "target_language" key that the rest of the code reads
        cfg["target_language"] = cfg["target_languages"]
        # Create output directories
        Path(cfg["output_dir"]).mkdir(parents=True, exist_ok=True)
        Path(cfg["qa_report_dir"]).mkdir(parents=True, exist_ok=True)
return cfg
    except FileNotFoundError:
        logging.error(f"Config file {config_path} not found!")
        raise
    except Exception as e:
        logging.error(f"Failed to load config: {e}")
        raise
def load_glossary(glossary_path: str = "glossary.json") -> dict:
try:
if os.path.exists(glossary_path) and os.path.getsize(glossary_path) > 0:
with open(glossary_path, "r", encoding="utf-8") as f:
return json.load(f) or {}
logging.warning(f"术语表 {glossary_path} 不存在或为空,将不使用术语表")
return {}
except Exception as e:
logging.error(f"加载术语表失败({glossary_path}): {e}")
return {}
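# Expected glossary.json shape, inferred from the glossary[term][lang] lookups
# used below (the example entries are illustrative only):
# {
#   "Add to cart": {"fr": "Ajouter au panier", "de": "In den Warenkorb",
#                   "es": "Añadir al carrito"}
# }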
# --------------------------------------------------
# Model management
# --------------------------------------------------
class ModelManager:
def __init__(self, model_template: str):
self.model_cache = {}
self.model_template = model_template
self.supported_langs = {"en", "fr", "de", "es", "ar", "pt", "ja", "ru", "zh"}
def is_supported(self, src_lang: str, tgt_lang: str) -> bool:
return src_lang in self.supported_langs and tgt_lang in self.supported_langs
def get_model(self, src_lang: str, tgt_lang: str):
key = f"{src_lang}-{tgt_lang}"
if key not in self.model_cache:
model_name = self.model_template.format(src=src_lang, tgt=tgt_lang)
logging.info(f"加载模型: {model_name}")
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
self.model_cache[key] = {"tokenizer": tokenizer, "model": model}
logging.info(f"模型 {model_name} 加载成功")
return self.model_cache[key]
# --------------------------------------------------
# Content-protection helpers
# --------------------------------------------------
def protect_content(text: str) -> Tuple[str, Dict[str, str]]:
"""保护HTML标签和短代码"""
if not text:
return text, {}
protected = text
tag_map = {}
idx = 0
    # Protect shortcodes like [shortcode]
shortcodes = re.findall(r'(\[[^\]]+\])', protected)
for sc in shortcodes:
key = f"__SC{idx}__"
tag_map[key] = sc
protected = protected.replace(sc, key, 1)
idx += 1
    # Protect HTML tags like <tag>
html_tags = re.findall(r'(<[^>]+>)', protected)
for tag in html_tags:
key = f"__HTML{idx}__"
tag_map[key] = tag
protected = protected.replace(tag, key, 1)
idx += 1
return protected, tag_map
def restore_content(text: str, tag_map: Dict[str, str]) -> str:
"""还原被保护的内容"""
if not text or not tag_map:
return text
restored = text
for key, original in tag_map.items():
restored = restored.replace(key, original)
return restored
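# Round-trip example, worked by hand from the logic above (shortcodes are
# replaced before HTML tags, so placeholder indices are assigned in that order):
#   protect_content('<b>Hi</b> [btn]')
#     -> ('__HTML1__Hi__HTML2__ __SC0__',
#         {'__SC0__': '[btn]', '__HTML1__': '<b>', '__HTML2__': '</b>'})
# restore_content() then inverts the mapping to recover the original string.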
# --------------------------------------------------
# Translation & checks
# --------------------------------------------------
BRAND_NAMES = {"Flatsome", "WPML", "Helsinki-NLP", "MarianMT", "Petsva"}
def should_translate(text: str, tgt_lang: str, glossary: dict) -> bool:
s = (text or "").strip()
if not s:
return False
if s in BRAND_NAMES:
return False
if re.fullmatch(r'^[\d\.,\s]+(?:USD|EUR|¥|\$|€)?$', s):
return False
    # WPML shortcodes: skip translation when the entire text is one shortcode
if re.fullmatch(r'\[wpml[^\]]*\]', s, re.IGNORECASE):
return False
if re.fullmatch(r'^SKU[-_]\d+$', s, re.IGNORECASE):
return False
if re.fullmatch(r'https?://.*|www\..*|/[\w\-/]+', s):
return False
if s in glossary and tgt_lang in glossary[s]:
return False
if re.search(r'(email|password|username|account|phone|tel|contact)', s, re.IGNORECASE):
return False
return True
def translate_text(text: str, tgt_lang: str, model_manager: ModelManager, glossary: dict, src_lang: str = "en") -> str:
s = (text or "").strip()
    # 1) Glossary takes precedence
if s in glossary and tgt_lang in glossary[s]:
return glossary[s][tgt_lang]
    # 2) Return unchanged when no translation is needed
if not should_translate(text, tgt_lang, glossary):
return text or ""
    # 3) Language-pair support
    if not model_manager.is_supported(src_lang, tgt_lang):
        logging.warning(f"Unsupported language pair {src_lang}->{tgt_lang}; returning source text")
        return text or ""
    # 4) Protect content (HTML tags + shortcodes)
protected, tag_map = protect_content(text)
try:
model_pack = model_manager.get_model(src_lang, tgt_lang)
tokenizer, model = model_pack["tokenizer"], model_pack["model"]
        # 5) Translate
inputs = tokenizer(protected, return_tensors="pt", padding=True, truncation=True, max_length=512)
out = model.generate(**inputs)
translated = tokenizer.decode(out[0], skip_special_tokens=True)
        # 6) Restore protected content
translated = restore_content(translated, tag_map)
        # 7) Repair tag integrity
translated = fix_tag_integrity(translated)
        # 8) Validity check
        if not is_valid_translation(translated):
            logging.error("Invalid translation result; falling back to source text")
            return fix_tag_integrity(text or "")
return translated
except Exception as e:
logging.error(f"翻译失败: {s[:60]}... 错误: {e}")
return fix_tag_integrity(text or "")
def fix_tag_integrity(txt: str) -> str:
"""修复标签完整性"""
if not txt:
return txt
    # Simple tag-balance check; skip self-closing tags and void elements,
    # which never take a closing tag
    void_tags = {"br", "hr", "img", "input", "meta", "link"}
    open_tags = [t for t in re.findall(r'<([a-zA-Z0-9]+)[^>]*(?<!/)>', txt) if t.lower() not in void_tags]
    close_tags = re.findall(r'</([a-zA-Z0-9]+)>', txt)
    # Work out which opening tags were never closed
for tag in close_tags:
if tag in open_tags:
open_tags.remove(tag)
    # Append the missing closing tags
result = txt
for tag in reversed(open_tags):
result += f'</{tag}>'
logging.warning(f"补全未闭合标签:</{tag}>")
return result
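# Hand-checked examples of the repair above:
#   fix_tag_integrity('<b>Hello')  -> '<b>Hello</b>'
#   fix_tag_integrity('<br/>Hi')   -> '<br/>Hi'  (self-closing/void tags are left alone)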
def is_valid_translation(txt: str) -> bool:
s = (txt or "").strip()
if len(s) < 2:
return False
    # Share of special characters
specials = len(re.findall(r'[*\-=_+#@]', s))
if specials and specials / max(1, len(s)) > 0.3:
return False
    # Reject long runs of a single repeated symbol
if re.fullmatch(r'([*\-=])\1{6,}', s):
return False
return True
def check_consistency(source: str, target: str) -> list:
issues = []
if not (source or "").strip():
return issues
    # Number/amount consistency, using targeted extraction
def extract_important_numbers(text: str) -> List[str]:
"""只提取重要的数字(货币等),避免日期、页码等"""
        patterns = [
            r'\$\d+\.?\d*',     # USD amounts
            r'€\d+\.?\d*',      # EUR amounts
            r'¥\d+\.?\d*',      # JPY amounts
            r'£\d+\.?\d*',      # GBP amounts
            r'\b\d+\.\d{2}\b',  # decimals formatted like prices
        ]
numbers = []
for pattern in patterns:
numbers.extend(re.findall(pattern, text))
return numbers
s_num = extract_important_numbers(source or "")
t_num = extract_important_numbers(target or "")
if s_num != t_num:
issues.append(f"数字或金额不一致:源[{','.join(s_num)}] → 目标[{','.join(t_num)}]")
    # {placeholder} tokens
def extract_placeholders(text: str) -> list:
return re.findall(r'\{[^}]+\}', text or "")
sp = extract_placeholders(source or "")
tp = extract_placeholders(target or "")
if len(sp) != len(tp):
issues.append(f"占位符数量不一致:源{len(sp)}个 → 目标{len(tp)}个")
else:
semantic_maps = {
"physical address": ["adresse physique", "physische adresse", "dirección física"],
"email": ["email", "e-mail", "correo electrónico"],
"phone": ["téléphone", "telefon", "teléfono"],
"name": ["nom", "name", "nombre"],
"order id": ["numéro de commande", "bestellnummer", "número de pedido"]
}
for s_ph, t_ph in zip(sp, tp):
if s_ph == t_ph:
continue
s_core = s_ph.strip("{}").lower()
t_core = t_ph.strip("{}").lower()
ok = False
if s_core in semantic_maps:
ok = t_core in semantic_maps[s_core]
else:
cores = [w for w in s_core.split() if len(w) > 2]
ok = any(c in t_core for c in cores)
if not ok:
issues.append(f"占位符内容不匹配:源{s_ph} → 目标{t_ph}")
    # Shortcode consistency
def extract_shortcodes(text: str) -> list:
return re.findall(r'\[[^\]]+\]', text or "")
source_shortcodes = extract_shortcodes(source or "")
target_shortcodes = extract_shortcodes(target or "")
if len(source_shortcodes) != len(target_shortcodes):
issues.append(f"短代码数量不一致:源{len(source_shortcodes)}个 → 目标{len(target_shortcodes)}个")
else:
for i, (src_sc, tgt_sc) in enumerate(zip(source_shortcodes, target_shortcodes)):
if src_sc != tgt_sc:
issues.append(f"短代码内容不匹配:位置{i+1}, 源={src_sc}, 目标={tgt_sc}")
    # Tag consistency
def extract_tags(text: str) -> list:
tags = [t for t in re.findall(r"<[^>]+>", text or "") if t.strip()]
return [t for t in tags if not re.search(r'img', t, re.I)]
st = extract_tags(source or "")
tt = extract_tags(target or "")
if len(st) != len(tt):
issues.append(f"标签数量不一致:源{len(st)}个 → 目标{len(tt)}个")
    # Target validity
    t_stripped = (target or "").strip()
    if not t_stripped or re.fullmatch(r'([*\-=\_\+\#\@\s])+', t_stripped) or len(t_stripped) < max(1, len((source or "").strip())) * 0.3:
        issues.append("Invalid target text: all special characters/whitespace, or truncated below 30% of the source length")
return issues
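# Worked example of the checks above: check_consistency("Price: $9.99", "Prix : 9,99")
# reports a single number/amount mismatch: "$9.99" is extracted from the source,
# but "9,99" matches none of the patterns (comma decimal, no currency symbol).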
# --------------------------------------------------
# XLIFF processing
# --------------------------------------------------
def ensure_target_element(trans_unit, ns, tgt_lang: str, default_state: str = "translated"):
target = trans_unit.find(f"{{{ns}}}target")
    if target is None:
        # Create the element in the XLIFF namespace, not the null namespace
        target = etree.SubElement(trans_unit, f"{{{ns}}}target")
    target.set(f"{{{XML_NS}}}lang", tgt_lang)
if "state" not in target.attrib:
target.set("state", default_state)
return target
def build_qa_root_like(original_root):
qa_root = etree.Element(original_root.tag, nsmap=original_root.nsmap)
for k, v in original_root.attrib.items():
qa_root.set(k, v)
return qa_root
def process_xliff(file_path: Path, model_manager: ModelManager, glossary: dict, config: dict):
try:
logging.info(f"开始处理文件: {file_path.name}")
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(str(file_path), parser)
root = tree.getroot()
ns = root.nsmap.get(None, "urn:oasis:names:tc:xliff:document:1.2")
tool_ns = root.nsmap.get("tool", "https://cdn.wpml.org/xliff/custom-attributes.xsd")
file_elem = root.find(f".//{{{ns}}}file")
if file_elem is None:
logging.warning(f"文件 {file_path.name} 无 <file> 标签,跳过")
return
src_lang = file_elem.get("source-language") or config["source_language"]
tgt_lang = file_elem.get("target-language") or (
config["target_language"][0] if isinstance(config["target_language"], list) else config["target_language"]
)
if src_lang == tgt_lang:
logging.info(f"源语言与目标语言相同({src_lang}),跳过文件 {file_path.name}")
return
qa_root = build_qa_root_like(root)
        qa_file = etree.SubElement(qa_root, f"{{{ns}}}file", attrib=file_elem.attrib)
        qa_body = etree.SubElement(qa_file, f"{{{ns}}}body")
trans_units = file_elem.findall(f".//{{{ns}}}body/{{{ns}}}trans-unit")
total = len(trans_units)
if total == 0:
logging.warning(f"文件 {file_path.name} 无翻译单元,跳过")
return
logging.info(f"找到 {total} 个翻译单元")
write_translated = not bool(config.get("generate_qa_only", False))
for i, tu in enumerate(trans_units, 1):
if i % 10 == 0 or i == total:
logging.info(f"进度 {i}/{total}")
source_el = tu.find(f"{{{ns}}}source")
if source_el is None:
continue
src_text = source_el.text or ""
translated_text = translate_text(src_text, tgt_lang, model_manager, glossary, src_lang)
issues = check_consistency(src_text, translated_text)
if write_translated:
target_el = ensure_target_element(tu, ns, tgt_lang, default_state="translated")
target_el.text = translated_text
if issues:
tu_id = tu.get("id") or f"unit-{i}"
                qa_tu = etree.SubElement(qa_body, f"{{{ns}}}trans-unit", id=tu_id)
extradata = tu.find(f"{{{tool_ns}}}extradata")
if extradata is not None:
qa_ex = etree.SubElement(qa_tu, f"{{{tool_ns}}}extradata", attrib=extradata.attrib)
qa_ex.text = extradata.text
                etree.SubElement(qa_tu, f"{{{ns}}}source").text = src_text
                t = etree.SubElement(qa_tu, f"{{{ns}}}target")
                t.set(f"{{{XML_NS}}}lang", tgt_lang)
                t.set("state", "needs-review-translation")
                t.text = translated_text
                etree.SubElement(qa_tu, f"{{{ns}}}note").text = "; ".join(issues)
out_dir = Path(config["output_dir"])
out_dir.mkdir(exist_ok=True)
if write_translated:
out_file = out_dir / f"{file_path.stem}_translated.xliff"
tree.write(str(out_file), encoding="utf-8", xml_declaration=True, pretty_print=True)
logging.info(f"已保存翻译文件: {out_file}")
has_qa = len(qa_body) > 0
if has_qa:
qa_dir = Path(config["qa_report_dir"])
qa_dir.mkdir(exist_ok=True)
qa_file_out = qa_dir / f"{file_path.stem}_qa.xliff"
            # Write out the full QA root structure
qa_tree = etree.ElementTree(qa_root)
qa_tree.write(str(qa_file_out), encoding="utf-8", xml_declaration=True, pretty_print=True)
logging.info(f"QA 文件已保存至 {qa_file_out}")
except Exception as e:
logging.error(f"处理文件 {file_path.name} 出错: {e}")
def main():
    parser = argparse.ArgumentParser(description="Translate XLIFF files")
    parser.add_argument("--config", default="config.yaml", help="Path to the config file")
args = parser.parse_args()
config = load_config(args.config)
glossary = load_glossary()
model_manager = ModelManager(config["model_template"])
    input_dir = Path(config["input_dir"])
    if not input_dir.is_dir():
        logging.error(f"Input directory {input_dir} does not exist")
        return
    for xliff_file in input_dir.glob("*.xliff"):
        process_xliff(xliff_file, model_manager, glossary, config)
if __name__ == "__main__":
    main()

# --------------------------------------------------
# config.yaml
# --------------------------------------------------
# Source language
source_language: en
# Target languages (can be extended at any time)
target_languages:
- fr
- de
- es
# Can be added later: ar, pt, ja, ru, zh, ...
# Input/output directories
input_dir: "xliff_in"
output_dir: "output"
qa_report_dir: qa_reports
# When true, only QA reports are written (no *_translated.xliff output)
generate_qa_only: true
# Model name template
# {src} and {tgt} are replaced with the source and target language codes
model_template: "Helsinki-NLP/opus-mt-{src}-{tgt}"
# In cmd, run .venv\Scripts\activate.bat to activate the environment
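# Then run the script (assuming translate.py sits in the current directory):
#   python translate.py --config config.yaml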