import os
import tempfile

import pandas as pd
import win32com.client as win32
from docx import Document
from tqdm import tqdm
from win32com.client import constants
def replace_in_docx(docx_path, replacements, output_path=None):
    """
    Apply a table of text replacements to a .docx file and save the result.

    Every element returned by ``_get_all_text_elements`` is checked against
    each replacement pair; whenever the search text occurs in the element's
    text, the substitution is delegated to ``_replace_text_in_element``.

    :param docx_path: path of the .docx file to read
    :param replacements: mapping of ``{text to find: replacement text}``
    :param output_path: where to save the result; ``None`` overwrites
        ``docx_path``
    """
    destination = docx_path if output_path is None else output_path

    document = Document(docx_path)

    # Pairs are applied in mapping order, so a later pair sees the text
    # produced by the earlier ones.
    for text_element in _get_all_text_elements(document):
        for needle, substitute in replacements.items():
            if needle not in text_element.text:
                continue
            _replace_text_in_element(text_element, needle, substitute)

    document.save(destination)
def _get_all_text_elements(doc):
"""获取文档中所有包含文本的元素"""
elements = []
# 正文段落
for paragraph in doc.paragraphs:
elements.append(paragraph)
# 表格中的段落
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
elements.append(paragraph)
# 页眉和页脚中的段落
for section in doc.sections:
# 处理页眉
for header in section.header.paragraphs:
elements.append(header)
# 处理页脚
for footer in section.footer.paragraphs:
elements.append(footer)
return elements
def _replace_text_in_element(element, old_text, new_text):
"""在元素中替换文本,保持原有格式"""
# 如果整个段落都匹配,直接替换
if element.text == old_text:
element.text = new_text
return
# 如果是部分匹配,需要逐run替换
runs = element.runs
if not runs:
return
# 创建一个新的运行列表
new_runs = []
# 遍历每个run
for run in runs:
if old_text in run.text:
# 找到所有匹配位置
start_idx = 0
while True:
idx = run.text.find(old_text, start_idx)
if idx == -1:
break
# 处理匹配前的文本
if idx > start_idx:
prefix = run.text[start_idx:idx]
new_run = element.add_run(prefix)
_copy_run_format(run, new_run)
new_runs.append(new_run)
# 处理匹配的文本
replacement_run = element.add_run(new_text)
_copy_run_format(run, replacement_run)
new_runs.append(replacement_run)
# 更新起始位置
start_idx = idx + len(old_text)
# 处理剩余文本
if start_idx < len(run.text):
suffix = run.text[start_idx:]
new_run = element.add_run(suffix)
_copy_run_format(run, new_run)
new_runs.append(new_run)
else:
# 没有匹配,直接复制
new_run = element.add_run(run.text)
_copy_run_format(run, new_run)
new_runs.append(new_run)
# 清除原有的所有run
for i in range(len(element.runs) - 1, -1, -1):
p = element._element
p.remove(element.runs[i]._element)
# 注意:这里不添加新的run,因为在上面的处理中已经添加了
def _copy_run_format(source_run, target_run):
"""复制一个run的格式到另一个run"""
target_run.bold = source_run.bold
target_run.italic = source_run.italic
target_run.underline = source_run.underline
target_run.font.color.rgb = source_run.font.color.rgb
target_run.font.size = source_run.font.size
target_run.font.name = source_run.font.name
# 可以根据需要添加更多格式属性
def convert_doc_to_docx(doc_path, docx_path):
    """
    Convert a Word document to another Word format by driving Word over COM.

    The input file is opened in a hidden Word instance and saved under
    ``docx_path``. The save format follows the extension of ``docx_path``:
    a target ending in ``.doc`` is saved with ``wdFormatDocument``, anything
    else with ``wdFormatXMLDocument``. This keeps the usual .doc -> .docx
    conversion unchanged while making a conversion back to a ``.doc`` target
    produce a file whose content matches its extension.

    The document and the Word instance are always released, whether the
    conversion succeeds or fails.

    :param doc_path: path of the input document
    :param docx_path: path of the output document
    :return: ``True`` when the file was saved, ``False`` when opening or
        saving failed (the error is printed)
    """
    word = None
    doc = None
    try:
        # Start (or attach to) Word and keep its window hidden.
        word = win32.gencache.EnsureDispatch('Word.Application')
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(doc_path))

        # Pick the save format that matches the requested extension.
        extension = os.path.splitext(os.fspath(docx_path))[1].lower()
        if extension == '.doc':
            file_format = constants.wdFormatDocument
        else:
            file_format = constants.wdFormatXMLDocument
        doc.SaveAs(os.path.abspath(docx_path), FileFormat=file_format)
    except Exception as e:
        print(f"转换 {doc_path} 失败: {e}")
        return False
    finally:
        # Best-effort clean-up: a failure here must not hide the outcome of
        # the conversion itself, so errors are deliberately ignored.
        if doc is not None:
            try:
                doc.Close(SaveChanges=constants.wdDoNotSaveChanges)
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
    return True
def _ensure_output_dir(output_dir, rel_path):
    """Create and return the folder under output_dir that mirrors rel_path's folder."""
    target_dir = os.path.join(output_dir, os.path.dirname(rel_path))
    os.makedirs(target_dir, exist_ok=True)
    return target_dir


def process_files(input_dir, output_dir, xlsx_path):
    """
    Apply the replacement table to every .doc and .docx file under a directory.

    The first two columns of the workbook (read without a header row) form
    the ``{text to find: replacement}`` table; rows with a missing value in
    either column are dropped and both values are converted to ``str``.
    ``input_dir`` is walked recursively and each result is written to the
    same relative location under ``output_dir``.

    .docx files are processed directly with ``replace_in_docx``. .doc files
    are converted with ``convert_doc_to_docx`` into an intermediate .docx,
    processed, and then passed to ``convert_doc_to_docx`` again to write the
    final file under the original name. The intermediate file lives in a
    private temporary directory, so it can never overwrite (and afterwards
    delete) the output of a .docx input that has the same base name.

    Files whose name starts with ``~$`` (owner/lock files Word leaves next to
    an open document) are skipped. A failure on one file is printed and the
    remaining files are still processed.

    :param input_dir: directory containing the documents to process
    :param output_dir: directory that receives the processed documents
    :param xlsx_path: path of the replacement lookup workbook (.xlsx)
    """
    # Load the replacement table.
    try:
        df = pd.read_excel(xlsx_path, header=None)
        # Two columns are required: search text and replacement text.
        if df.shape[1] < 2:
            print("替换对照表至少需要两列数据!")
            return
        replacements = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
        # Drop pairs with a missing side and normalise both sides to str.
        replacements = {
            str(k): str(v)
            for k, v in replacements.items()
            if pd.notna(k) and pd.notna(v)
        }
        print(f"已加载 {len(replacements)} 条替换规则")
    except Exception as e:
        print(f"读取替换对照表失败: {e}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Collect (absolute path, path relative to input_dir) for each document.
    doc_files = []
    docx_files = []
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.startswith('~$'):
                continue
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, input_dir)
            lowered = file.lower()
            if lowered.endswith('.doc'):
                doc_files.append((file_path, rel_path))
            elif lowered.endswith('.docx'):
                docx_files.append((file_path, rel_path))
    print(f"找到 {len(docx_files)} 个docx文件和 {len(doc_files)} 个doc文件")

    # .docx files: replace directly into the mirrored output location.
    print("正在处理docx文件...")
    for file_path, rel_path in tqdm(docx_files):
        output_file_dir = _ensure_output_dir(output_dir, rel_path)
        output_file_path = os.path.join(output_file_dir, os.path.basename(file_path))
        try:
            replace_in_docx(file_path, replacements, output_file_path)
        except Exception as e:
            print(f"处理 {rel_path} 失败: {e}")

    # .doc files: convert -> replace -> convert back.
    print("正在处理doc文件...")
    for file_path, rel_path in tqdm(doc_files):
        output_file_dir = _ensure_output_dir(output_dir, rel_path)
        base_name = os.path.basename(file_path)
        output_file_path = os.path.join(output_file_dir, base_name)
        try:
            # The temporary directory (and the intermediate .docx in it) is
            # removed when the ``with`` block ends, on success or failure.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_docx_path = os.path.join(temp_dir, base_name + 'x')
                if convert_doc_to_docx(file_path, temp_docx_path):
                    replace_in_docx(temp_docx_path, replacements, temp_docx_path)
                    convert_doc_to_docx(temp_docx_path, output_file_path)
        except Exception as e:
            print(f"处理 {rel_path} 失败: {e}")

    print("处理完成!")
if __name__ == "__main__":
    # Adjust the three paths below to the local environment before running.
    INPUT_DIR = r"E:\\Pycharm project\\test1\\input"  # directory with the input documents
    OUTPUT_DIR = r"E:\\Pycharm project\\test1\\output"  # directory for the processed documents
    XLSX_PATH = r"E:\\Pycharm project\\test1\\compare.xlsx"  # replacement lookup workbook (.xlsx)
    process_files(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR, xlsx_path=XLSX_PATH)