python利用win32com读取doc和pdf内容，并保存到文件

最新推荐文章于 2024-11-24 22:27:32 发布

vincent_hahaha

最新推荐文章于 2024-11-24 22:27:32 发布

阅读量2.8k

点赞数

CC 4.0 BY-SA版权

分类专栏： python

本文链接：https://blog.youkuaiyun.com/vincent_duan/article/details/116377326

python 专栏收录该内容

54 篇文章

订阅专栏

本文介绍如何使用Python的win32com库将Word（.doc, .docx）及PDF文件转换为文本文件。针对不同文件类型，通过指定路径加载文件，并利用Word.Application对象打开文件再保存为文本格式。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

将使用win32com包（由pywin32提供，可通过 pip install pywin32 安装）进行处理。该方法通过COM调用本机的Microsoft Word，因此只能在已安装Word的Windows系统上运行。

读取doc文件

# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch


def word2txt(filePath, savePath = ''):
    """Convert a Word document (.doc/.docx) to a text file using Word automation.

    The output file has the same base name as the input with a ``.txt``
    extension.

    Args:
        filePath: Path of the document to convert. It is handed to Word as
            given, so the caller should pass an absolute path.
        savePath: Directory to write the text file into. When empty, the
            directory part of ``filePath`` is used.

    Returns:
        None. If the file extension is not ``.doc`` or ``.docx`` a message is
        printed and nothing is converted.
    """
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    # Validate the extension up front; without this an unsupported file would
    # yield an empty output name and SaveAs would target the directory itself.
    stem, ext = os.path.splitext(filename)
    if ext.lower() not in ('.doc', '.docx'):
        print('格式不正确，仅支持doc和docx格式')
        return

    if savePath == '':
        savePath = dirs
    word2txtPath = os.path.join(savePath, stem + '.txt')
    print(word2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 selects a plain-text save format
            # (wdFormatDOSText in Word's WdSaveFormat enumeration).
            mytxt.SaveAs(word2txtPath, 4)
        finally:
            # 0 = wdDoNotSaveChanges: never prompt, even if SaveAs failed.
            mytxt.Close(0)
    finally:
        # Shut down the automation instance so no Word process is left behind.
        wordappp.Quit()


if __name__ == '__main__':
    # Demo: convert a sample .docx located in the current working directory.
    # Word is given an absolute path, so resolve the relative one first.
    filePath = r'./专业课.docx'
    filePath = os.path.abspath(filePath)
    word2txt(filePath)

读取pdf（同样借助Word打开PDF后另存为文本，需要Word 2013及以上版本才能打开PDF文件）

# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch

def pdf2txt(filePath, savePath=''):
    """Convert a PDF file to a text file by opening it in Word and re-saving it.

    The output file has the same base name as the input with a ``.txt``
    extension.

    Args:
        filePath: Path of the PDF to convert. It is handed to Word as given,
            so the caller should pass an absolute path.
        savePath: Directory to write the text file into. When empty, the
            directory part of ``filePath`` is used.

    Returns:
        None. If the file extension is not ``.pdf`` (case-insensitive) a
        message is printed and nothing is converted.
    """
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    stem, ext = os.path.splitext(filename)
    if ext.lower() != '.pdf':
        print('格式不正确，仅支持pdf格式')
        return

    if savePath == '':
        savePath = dirs
    pdf2txtPath = os.path.join(savePath, stem + '.txt')
    print(pdf2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 selects a plain-text save format
            # (wdFormatDOSText in Word's WdSaveFormat enumeration).
            mytxt.SaveAs(pdf2txtPath, 4)
        finally:
            # 0 = wdDoNotSaveChanges: never prompt, even if SaveAs failed.
            mytxt.Close(0)
    finally:
        # Shut down the automation instance so no Word process is left behind.
        wordappp.Quit()


if __name__ == '__main__':
    # Demo: convert a sample PDF located in the current working directory.
    # Word is given an absolute path, so resolve the relative one first.
    filePath = r'./论文.pdf'
    filePath = os.path.abspath(filePath)
    pdf2txt(filePath)