这是比较完整的版本:处理一份116页的文档大约需要4秒,不过代码里嵌套了很多for循环,写得不太美观。
# coding=utf8
import os
import re
import datetime
from win32com import client
from docx import Document
import pywintypes
# Matches an amount written as 1-3 digits, any number of ",ddd" thousands
# groups, then a dot and two decimals, e.g. "1,234,567.89" or "12.34".
# Raw string: "\d" in a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions.
CommaNumberPattern = re.compile(r'\d{1,3}([,,]\d\d\d)*([.]\d\d)')
# Matches one ASCII digit; used to overwrite every digit with a placeholder.
NumberPattern = re.compile(r'[0-9]')
def _convert_doc_file(doc_file):
    """Save one .doc file as .docx next to it, always releasing Word afterwards."""
    word = client.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_file)
        try:
            # 16 is Word's wdFormatDocumentDefault save format (.docx).
            doc.SaveAs(os.path.splitext(doc_file)[0] + ".docx", 16)
        finally:
            # Close even when SaveAs fails so the source file is not left locked.
            doc.Close()
    finally:
        # Quit even when Open/SaveAs fails so no Word process is left behind.
        word.Quit()


def doc_to_docx(path):
    """Convert every ``.doc`` file directly inside ``path`` to ``.docx`` via Word.

    Each file is opened through Word COM automation, saved with the same base
    name and a ``.docx`` extension, and the original ``.doc`` is deleted after
    a successful conversion. If Word reports a COM error for a file, a message
    is printed and that ``.doc`` file is left in place.

    Args:
        path: Directory to scan (not recursive). Only names whose extension is
            exactly ``.doc`` are converted.

    Returns:
        None.
    """
    # Word runs as a separate process with its own working directory, so hand
    # it absolute paths rather than paths relative to this script.
    path = os.path.abspath(path)
    doc_filenames = [
        os.path.join(path, filename)
        for filename in os.listdir(path)
        if os.path.splitext(filename)[1] == '.doc'
    ]
    for doc_file in doc_filenames:
        try:
            _convert_doc_file(doc_file)
            # Only reached when conversion and Word shutdown both succeeded.
            os.remove(doc_file)
        except pywintypes.com_error:
            print(doc_file, '文件异常,请手动转换')
def _mask_digits(run):
    """Replace every ASCII digit in ``run.text`` with '1', writing back only on change."""
    original = run.text
    masked = NumberPattern.sub('1', original)
    # python-docx documents that assigning Run.text replaces the run's existing
    # content, so leave runs without digits untouched.
    if masked != original:
        run.text = masked


def replace_comma_number(file_path, file_out_path):
    """Mask the digits of comma-formatted amounts in a .docx file.

    The document is opened with python-docx, digits in the affected runs are
    overwritten with ``'1'``, and the result is saved to ``file_out_path``.

    * Body paragraphs: a run is masked only when that run's own text matches
      ``CommaNumberPattern``.
    * Table cells: when the concatenated text of the cell's paragraphs matches
      ``CommaNumberPattern``, the runs of every paragraph in that cell are
      masked (previously only the first paragraph was rewritten, so an amount
      in a later paragraph stayed visible).

    Args:
        file_path: Path of the .docx file to read.
        file_out_path: Path the modified document is saved to.

    Returns:
        None.
    """
    document = Document(file_path)

    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            if CommaNumberPattern.search(run.text) is not None:
                _mask_digits(run)

    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                # Match on the whole cell because an amount may be split
                # across several runs.
                cell_text = ''.join(p.text for p in cell.paragraphs)
                if CommaNumberPattern.search(cell_text) is None:
                    continue
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        _mask_digits(run)

    document.save(file_out_path)
if __name__ == "__main__":
    # Folder holding the source documents, and the folder receiving the masked copies.
    path = 'C:\\Users\\cn190441\\PycharmProjects\\KPMG_NLP\\Num_replace\\original_doc'
    out_path = 'C:\\Users\\cn190441\\PycharmProjects\\KPMG_NLP\\Num_replace\\output'

    # Convert legacy .doc files first; python-docx is only given .docx files below.
    doc_to_docx(path)

    # doc_to_docx leaves a .doc in place when its conversion fails, and the
    # folder may hold unrelated files; passing those to python-docx would
    # abort the whole run, so process .docx files only.
    filenames = [
        name for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.docx'
    ]
    # Saving fails if the output folder does not exist yet.
    os.makedirs(out_path, exist_ok=True)

    start_time = datetime.datetime.now()
    print('There is {} files in all.'.format(len(filenames)))
    for filename in filenames:
        replace_comma_number(os.path.join(path, filename), os.path.join(out_path, filename))
    print('Time Consumption: {}'.format(datetime.datetime.now() - start_time))