下面使用 win32com 包(通过 COM 调用 Word)进行处理。
读取 doc/docx 文件
# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def word2txt(filePath, savePath=''):
    """Convert a Word document (.doc/.docx) to a text file through Word COM automation.

    Args:
        filePath: Path of the Word document to convert.
        savePath: Directory that receives the generated ``.txt`` file. When
            empty, the file is written to the directory of ``filePath``.

    Returns:
        None. On success a ``.txt`` file named after the source document is
        saved. When the extension is neither ``.doc`` nor ``.docx`` a message
        is printed and no conversion is attempted.
    """
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    # Check the longer suffix first so ".docx" is not trimmed as ".doc".
    lowered = filename.lower()
    for suffix in ('.docx', '.doc'):
        if lowered.endswith(suffix):
            new_name = filename[:-len(suffix)] + '.txt'
            break
    else:
        # Without this guard the output name stays empty and SaveAs would be
        # pointed at the target directory itself.
        print('格式不正确,仅支持doc/docx格式')
        return

    if not savePath:
        savePath = dirs
    word2txtPath = os.path.join(savePath, new_name)
    print(word2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            mytxt.SaveAs(word2txtPath, 4)  # 4 = save the extracted result as text
        finally:
            # Close the document even when SaveAs fails.
            mytxt.Close()
    finally:
        # Quit Word so the automation instance is not left running.
        wordappp.Quit()
if __name__ == '__main__':
    # Demo run: convert the sample document located in the current working directory.
    word2txt(os.path.abspath(r'./专业课.docx'))
读取 pdf 文件
# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def pdf2txt(filePath, savePath=''):
    """Convert a PDF file to a text file by opening it through Word COM automation.

    Args:
        filePath: Path of the PDF file to convert.
        savePath: Directory that receives the generated ``.txt`` file. When
            empty, the file is written to the directory of ``filePath``.

    Returns:
        None. On success a ``.txt`` file named after the source file is saved.
        When the extension is not ``.pdf`` a message is printed and no
        conversion is attempted.
    """
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    # Case-insensitive check so mixed-case extensions such as ".Pdf" are accepted too.
    if not filename.lower().endswith('.pdf'):
        print('格式不正确,仅支持pdf格式')
        return
    new_name = filename[:-4] + '.txt'

    if not savePath:
        savePath = dirs
    pdf2txtPath = os.path.join(savePath, new_name)
    print(pdf2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            mytxt.SaveAs(pdf2txtPath, 4)  # 4 = extract the text
        finally:
            # Close the document even when SaveAs fails.
            mytxt.Close()
    finally:
        # Quit Word so the automation instance is not left running.
        wordappp.Quit()
if __name__ == '__main__':
    # Demo run: convert the sample PDF located in the current working directory.
    pdf2txt(os.path.abspath(r'./论文.pdf'))