# This feature requires the win32com module for Python. Install it with:
#     python -m pip install pypiwin32
# coding=utf-8
'''
word 文档信息提取
'''
import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def word2txt(filepath, savepath=''):
    """Export a Word document (.doc / .docx) to a text file via Word COM automation.

    The output file keeps the source document's base name and gets a
    ``.txt`` extension.

    Args:
        filepath: Path of the Word document to convert. Relative paths are
            resolved against the current working directory before being
            handed to Word.
        savepath: Directory in which to write the ``.txt`` file. When empty,
            the file is written next to the source document.

    Returns:
        None. If ``filepath`` does not end in ``.doc`` or ``.docx`` a message
        is printed and no conversion is attempted.
    """
    # 1. Split the (absolute) path into directory and file name.
    dirs, filename = os.path.split(os.path.abspath(filepath))

    # 2. Replace the extension on the *file name* only. Building the new name
    #    from the full path would make os.path.join() below discard savepath
    #    whenever filepath is absolute.
    lowered = filename.lower()
    if lowered.endswith('.docx'):
        new_name = filename[:-5] + '.txt'
    elif lowered.endswith('.doc'):
        new_name = filename[:-4] + '.txt'
    else:
        return print('仅支持 doc和docx格式')

    # 3. Decide where the text file goes (default: beside the source file).
    #    Word needs absolute paths; it does not resolve relative ones against
    #    Python's working directory.
    source_path = os.path.join(dirs, filename)
    new_path = os.path.abspath(os.path.join(savepath or dirs, new_name))

    # 4. Start Word and open the document.
    wordapp = wc.Dispatch('Word.Application')
    try:
        document = wordapp.Documents.Open(source_path)
        try:
            # 5. Save as text. 4 is the WdSaveFormat code the original author
            #    uses for text extraction (wdFormatDOSText in the Word object
            #    model reference -- confirm if a different encoding is needed).
            document.SaveAs(new_path, 4)
        finally:
            # Always close the document, even if SaveAs fails.
            document.Close()
    finally:
        # Shut down the Word instance so no WINWORD.EXE process is left behind.
        wordapp.Quit()
if __name__ == '__main__':
    # Resolve the three sample documents against the current working directory.
    filepath1, filepath2, filepath3 = (
        os.path.abspath(sample)
        for sample in (r'文档1.doc', r'文档2.docx', r'pdf文档.pdf')
    )
    # Only the first sample (.doc) is actually converted.
    word2txt(filepath1)
# Personally tested and confirmed working.