PHP 提取word与PDF文件文本信息

最新推荐文章于 2023-12-20 09:51:07 发布

assasinSteven

最新推荐文章于 2023-12-20 09:51:07 发布

阅读量667

点赞数 1

分类专栏： python 文章标签：文本抽取 doc docx pdf 文本抽取

本文链接：https://blog.youkuaiyun.com/assasin0308/article/details/103750098

版权

python 专栏收录该内容

16 篇文章

订阅专栏

最近遇到了一个海南什么恶心的什么会议系统,其中恶心的需求就是:"xx,你把用户上传的个人简历文本信息提取出来呗,让用户一上传就能看见自己的简历信息,格式有doc,docx,还可能是PDF文件哦.......用什么方式实现不重要，这个需求很重要,客户很重视!"。我面带笑容，说：“好啊，没问题，小case。交给我”，可心里已经十万个CNM过去了，大家懂得。。。。。。

好了，废话少说，上菜！

用PHP实现？这你就太年轻了，还是用python吧，哈哈哈哈哈。。。。。咳咳

环境说明： python3.6 还需一个重要内裤,哦不类库 win32com(由 pywin32 包提供,只能在装有 Microsoft Word 的 Windows 上运行)

具体是这样的,请看第一部,把doc,docx文本提取出来:

#  coding=utf-8

import os,fnmatch
from win32com import client
from win32com.client import Dispatch

def WordToTxt(filepath,savePath=''):
    """Convert a .doc/.docx file to a plain-text file through Word automation.

    Args:
        filepath: Path of the Word document. It is made absolute before being
            handed to Word, so a relative path is resolved against this
            process's working directory.
        savePath: Directory that receives the .txt file. An empty string
            (the default) means "next to the source document".

    Returns:
        The absolute path of the generated .txt file, or None when the file
        does not have a .doc/.docx extension (a warning is printed and Word
        is never started in that case).
    """
    filepath = os.path.abspath(filepath)
    dirs, filename = os.path.split(filepath)
    stem, ext = os.path.splitext(filename)

    # Reject unsupported files *before* touching Word; continuing with an
    # empty output name would make SaveAs target the directory itself.
    if ext.lower() not in ('.doc', '.docx'):
        print("文件格式不正确,只支持doc,docx格式文件")
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    wordToTxtPath = os.path.join(target_dir, stem + '.txt')

    wordapp = client.Dispatch("Word.Application")
    try:
        mytxt = wordapp.Documents.Open(filepath)
        try:
            # 4 is the numeric save format the script has always used
            # (wdFormatDOSText in Word's WdSaveFormat enumeration).
            mytxt.SaveAs(wordToTxtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Without Quit every call leaves a Word application object behind.
        wordapp.Quit()

    return wordToTxtPath


if __name__ == "__main__":
    # Demo run: convert the sample resume that lives in the sibling
    # "shell" directory; the .txt is written next to it.
    filepath = os.path.abspath("../shell/resume.docx")
    WordToTxt(filepath)

第二部,把PDF文件文本提取出来:

# coding=utf-8
import os,fnmatch
from win32com import client
from win32com.client import Dispatch

def pdfToTxt(filePath,savePath = ''):
    """Convert a PDF file to a plain-text file by opening it in Word.

    Args:
        filePath: Path of the PDF. It is made absolute before being handed
            to Word.
        savePath: Directory that receives the .txt file. An empty string
            (the default) means "next to the source PDF".

    Returns:
        The absolute path of the generated .txt file, or None when the file
        does not have a .pdf extension (a warning is printed and Word is
        never started in that case).
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    stem, ext = os.path.splitext(filename)

    # Compare the lowercased extension so every spelling (.pdf, .PDF, .Pdf)
    # is accepted, not only the two the old fnmatch patterns listed.
    if ext.lower() != '.pdf':
        # The previous message claimed doc/docx support, which this
        # function does not offer.
        print("文件格式不正确,只支持pdf格式文件")
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    pdfToTxtPath = os.path.join(target_dir, stem + '.txt')

    pdfdapp = client.Dispatch("Word.Application")
    try:
        mytxt = pdfdapp.Documents.Open(filePath)
        try:
            # 4 is the numeric save format the script has always used
            # (wdFormatDOSText in Word's WdSaveFormat enumeration).
            mytxt.SaveAs(pdfToTxtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Without Quit every call leaves a Word application object behind.
        pdfdapp.Quit()

    return pdfToTxtPath





if __name__ == "__main__":
    # Demo run: convert the sample PDF resume that lives in the sibling
    # "shell" directory; the .txt is written next to it.
    filepath = os.path.abspath("../shell/resume.pdf")
    pdfToTxt(filepath)

第三部,把他们揉在一起,来个小封装:

# coding=utf-8
"""
多格式文档文本数据抽取
"""
import os,fnmatch
from win32com import client
from win32com.client import Dispatch
import random

def Files2Txt(filePath,savePath = ''):
    """Convert a doc/docx/pdf file to plain text through Word automation.

    The output file name is produced by TansType (original stem plus a
    random six-digit number), so the resulting path is returned to let the
    caller locate the file.

    Args:
        filePath: Path of the source document. It is made absolute before
            being handed to Word.
        savePath: Directory that receives the .txt file. An empty string
            (the default) means "next to the source document".

    Returns:
        The absolute path of the generated .txt file, or None when TansType
        rejects the file (Word is never started in that case).
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)

    file_ext = os.path.splitext(filename)[-1].lower()

    new_name = TansType(filename,file_ext)
    # TansType returns None for unsupported files; joining None into a path
    # would raise TypeError, so stop here instead.
    if not new_name:
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    wordToTxtPath = os.path.join(target_dir, new_name)

    app = client.Dispatch("Word.Application")
    try:
        mytxt = app.Documents.Open(filePath)
        try:
            # 4 is the numeric save format the script has always used
            # (wdFormatDOSText in Word's WdSaveFormat enumeration).
            mytxt.SaveAs(wordToTxtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Without Quit every call leaves a Word application object behind,
        # which adds up quickly during batch runs.
        app.Quit()

    return wordToTxtPath

"""
根据文件后缀修改文件名
返回修改后的文件名
"""
def TansType(filename,file_extension):
    """Build the output .txt name for a supported source document.

    The name is the original stem followed by a random six-digit number and
    ".txt". The random part only lowers the chance of name clashes; it does
    not guarantee uniqueness.

    Args:
        filename: File name (without directory) of the source document.
        file_extension: Extension the caller detected, including the dot.
            Compared case-insensitively.

    Returns:
        The new file name, or None when the extension is not pdf/doc/docx
        (a warning is printed) or when the file name does not actually carry
        an extension from the requested family.
    """
    requested = file_extension.lower()
    # splitext + lower() replaces the fnmatch/slicing pairs: fnmatch is
    # case-sensitive on some platforms, so "RESUME.PDF" used to be rejected
    # there even though the caller had already lowercased the extension.
    stem, actual = os.path.splitext(filename)
    actual = actual.lower()

    if requested == '.pdf':
        accepted = ('.pdf',)
    elif requested in ('.doc', '.docx'):
        accepted = ('.doc', '.docx')
    else:
        print("警告: \n 您输入的数据不合法,本脚本只支持doc,docx,pdf格式文件!")
        return None

    if actual not in accepted:
        return None

    round_str = random.randint(100000,999999)
    return stem + str(round_str) + '.txt'


if __name__ == "__main__":
    # Demo run against the sample PDF resume in the sibling "shell"
    # directory; swap in the .docx path below to try a Word document.
    # filepath = os.path.abspath(r'../shell/resume.docx')
    filepath = os.path.abspath("../shell/resume.pdf")
    Files2Txt(filepath)

第四部,当然是面向女朋友(对象)啦,批处理操作:

# coding=utf-8
"""
遍历目录及文件,打印所有文件的完整路径
并批量抽取文本信息
"""
import os,time
import extractTxt

# 遍历目录处理子文件
class Traversal():
    """Walk a directory tree and run a converter on every file found.

    The sub-directory layout of the source tree is mirrored under the save
    directory, and the converter is called once per file as
    ``func(absolute_file_path, absolute_save_directory)``.
    """

    def __init__(self,rootDir,func = None,saveDir = ''):
        """Store the traversal settings.

        Args:
            rootDir: Directory whose files are processed.
            func: Callable taking (file path, save directory).
            saveDir: Directory that receives the output. An empty string
                (the default) selects a sibling of rootDir named
                "news_<rootDir name>".
        """
        self.rootDir = rootDir
        self.saveDir = saveDir
        self.func = func

    def travelfile(self):
        """Resolve and create the save directory, then process the tree."""
        if self.saveDir == "":
            # abspath also strips a trailing separator, so "../shell/"
            # yields the name "shell" instead of an empty string.
            parent, name = os.path.split(os.path.abspath(self.rootDir))
            save_dir = os.path.join(parent, 'news_' + name)
        else:
            save_dir = os.path.abspath(self.saveDir)

        os.makedirs(save_dir, exist_ok=True)

        # Hand the resolved directory down; leaving it out made every
        # output land in the current working directory.
        self.allfiles(self.rootDir, save_dir)

    def allfiles(self,rootDir,save_dir = ''):
        """Recursively apply ``self.func`` to every file below rootDir.

        Args:
            rootDir: Directory to scan.
            save_dir: Output directory matching rootDir; sub-directories
                are created below it as they are encountered.
        """
        for entry in os.listdir(rootDir):
            path = os.path.join(rootDir, entry)
            if os.path.isfile(path):
                self.func(os.path.abspath(path), os.path.abspath(save_dir))
            elif os.path.isdir(path):
                new_save_dir = os.path.join(save_dir, entry)
                os.makedirs(new_save_dir, exist_ok=True)
                self.allfiles(path, new_save_dir)


if __name__ == '__main__':
    rootdir = r"../shell/"

    start_time = time.time()
    # extractTxt is the module that holds Files2Txt (the combined
    # doc/docx/pdf converter). The converter is the second argument; the
    # third (saveDir) is left at its default.
    trans = Traversal(rootdir, extractTxt.Files2Txt)
    trans.travelfile()
    # Take the end timestamp only after the batch has run; reading both
    # timestamps up front always reported a cost of about zero.
    end_time = time.time()

    print("cost_total_time",(end_time - start_time),'s')