最近遇到了一个海南的什么恶心的会议系统,其中最恶心的需求就是:“xx,你把用户上传的个人简历文本信息提取出来呗,让用户一上传就能看见自己的简历信息,格式有doc、docx,还可能是PDF文件哦……用什么方式实现不重要,这个需求很重要,客户很重视!”我面带笑容,说:“好啊,没问题,小case,交给我。”可心里已经十万个CNM过去了,大家懂的……
好了,废话少说,上菜!
用PHP实现?这你就太年轻了,还是用python吧,哈哈哈哈哈。。。。。咳咳
环境说明:Python 3.6,运行在已安装 Microsoft Word 的 Windows 上;还需一个重要内裤,哦不,类库 win32com(由 pywin32 包提供,可用 pip install pywin32 安装)
具体是这样的,请看第一步,把doc、docx文本提取出来:
# coding=utf-8
import os,fnmatch
from win32com import client
from win32com.client import Dispatch
def WordToTxt(filepath, savePath=''):
    """Convert a ``.doc``/``.docx`` file to a plain-text file via Microsoft Word.

    The text file keeps the source file's base name and gets a ``.txt``
    extension.

    Args:
        filepath: Path of the Word document. It is made absolute before being
            handed to Word, because Word resolves relative paths against its
            own working directory rather than this script's.
        savePath: Directory for the ``.txt`` file. When empty, the file is
            written next to the source document.

    Returns:
        The absolute path of the generated text file, or ``None`` when the
        file is not a ``.doc``/``.docx`` document (a warning is printed and
        Word is never started in that case).
    """
    filepath = os.path.abspath(filepath)
    dirs, filename = os.path.split(filepath)
    stem, ext = os.path.splitext(filename)
    # Compare the lower-cased extension so "CV.DOCX" is accepted on every
    # platform, not only where fnmatch happens to be case-insensitive.
    if ext.lower() not in ('.doc', '.docx'):
        print("文件格式不正确,只支持doc,docx格式文件")
        # Stop here: continuing would ask Word to save onto a directory path.
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    wordToTxtPath = os.path.join(target_dir, stem + '.txt')

    # DispatchEx starts a private Word instance, so quitting it afterwards
    # cannot close a Word window the user already has open.
    wordapp = client.DispatchEx("Word.Application")
    try:
        document = wordapp.Documents.Open(filepath)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText, i.e. save as plain text.
            document.SaveAs(wordToTxtPath, 4)
        finally:
            # 0 is WdSaveOptions.wdDoNotSaveChanges: never prompt on close.
            document.Close(0)
    finally:
        # Without Quit() every call leaves a WINWORD.EXE process behind.
        wordapp.Quit()
    return wordToTxtPath
if __name__ == '__main__':
    # Demo run: convert the sample resume stored in the sibling "shell" folder.
    demo_document = r'../shell/resume.docx'
    WordToTxt(os.path.abspath(demo_document))
第二步,把PDF文件文本提取出来(同样借助 Word 打开 PDF,需要 Word 2013 及以上版本):
# coding = utf-8
import os,fnmatch
from win32com import client
from win32com.client import Dispatch
def pdfToTxt(filePath, savePath=''):
    """Convert a ``.pdf`` file to a plain-text file via Microsoft Word.

    The PDF is opened with Word itself, so this relies on Word's built-in PDF
    import (available in Word 2013 and later). The text file keeps the source
    file's base name and gets a ``.txt`` extension.

    Args:
        filePath: Path of the PDF. It is made absolute before being handed to
            Word, because Word resolves relative paths against its own
            working directory rather than this script's.
        savePath: Directory for the ``.txt`` file. When empty, the file is
            written next to the source PDF.

    Returns:
        The absolute path of the generated text file, or ``None`` when the
        file is not a PDF (a warning is printed and Word is never started).
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    stem, ext = os.path.splitext(filename)
    # Lower-casing covers ".pdf", ".PDF" and mixed-case spellings alike.
    if ext.lower() != '.pdf':
        # The message used to claim doc/docx support, which was wrong here.
        print("文件格式不正确,只支持pdf格式文件")
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    pdfToTxtPath = os.path.join(target_dir, stem + '.txt')

    # DispatchEx starts a private Word instance, so quitting it afterwards
    # cannot close a Word window the user already has open.
    pdfdapp = client.DispatchEx("Word.Application")
    try:
        document = pdfdapp.Documents.Open(filePath)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText, i.e. save as plain text.
            document.SaveAs(pdfToTxtPath, 4)
        finally:
            # 0 is WdSaveOptions.wdDoNotSaveChanges: never prompt on close.
            document.Close(0)
    finally:
        # Without Quit() every call leaves a WINWORD.EXE process behind.
        pdfdapp.Quit()
    return pdfToTxtPath
if __name__ == '__main__':
    # Demo run: convert the sample PDF resume stored in the sibling "shell" folder.
    demo_pdf = r'../shell/resume.pdf'
    pdfToTxt(os.path.abspath(demo_pdf))
第三步,把他们揉在一起,来个小封装:
# coding = utf-8
"""
多格式文档文本数据抽取
"""
import os,fnmatch
from win32com import client
from win32com.client import Dispatch
import random
def Files2Txt(filePath, savePath=''):
    """Extract the text of a doc/docx/pdf file into a ``.txt`` file via Word.

    The output file name is produced by ``TansType`` and contains a random
    numeric suffix, so the generated path is returned for the caller to use.

    Args:
        filePath: Path of the source document. It is made absolute before
            being handed to Word, because Word resolves relative paths
            against its own working directory rather than this script's.
        savePath: Directory for the ``.txt`` file. When empty, the file is
            written next to the source document.

    Returns:
        The absolute path of the generated text file, or ``None`` when
        ``TansType`` rejects the file (Word is never started in that case).
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    file_ext = os.path.splitext(filename)[-1].lower()
    new_name = TansType(filename, file_ext)
    if not new_name:
        # TansType gives no name for unsupported files; joining None onto the
        # directory below would raise TypeError.
        return None

    target_dir = os.path.abspath(savePath) if savePath else dirs
    wordToTxtPath = os.path.join(target_dir, new_name)

    # DispatchEx starts a private Word instance, so quitting it afterwards
    # cannot close a Word window the user already has open.
    app = client.DispatchEx("Word.Application")
    try:
        document = app.Documents.Open(filePath)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText, i.e. save as plain text.
            document.SaveAs(wordToTxtPath, 4)
        finally:
            # 0 is WdSaveOptions.wdDoNotSaveChanges: never prompt on close.
            document.Close(0)
    finally:
        # Without Quit() every call leaves a WINWORD.EXE process behind.
        app.Quit()
    return wordToTxtPath
"""
根据文件后缀修改文件名
返回修改后的文件名
"""
def TansType(filename, file_extension):
    """Build the ``.txt`` output name for a supported document file.

    The result is the file name without its extension, followed by a random
    six-digit number and ``.txt``.

    Args:
        filename: File name of the source document (no directory part).
        file_extension: Lower-cased extension of ``filename`` including the
            dot, e.g. ``'.pdf'``.

    Returns:
        The new file name, or ``None`` when ``file_extension`` is not one of
        ``.pdf``/``.doc``/``.docx`` (a warning is printed) or when
        ``filename`` does not actually end with a matching extension.
    """
    if file_extension == '.pdf':
        accepted = ('.pdf',)
    elif file_extension == '.docx' or file_extension == '.doc':
        accepted = ('.doc', '.docx')
    else:
        print("警告: \n 您输入的数据不合法,本脚本只支持doc,docx,pdf格式文件!")
        return None

    # Match against the lower-cased name so "RESUME.PDF" behaves the same on
    # every platform; fnmatch is only case-insensitive on some of them.
    lowered = filename.lower()
    for ext in accepted:
        if lowered.endswith(ext):
            stem = filename[:-len(ext)]
            round_str = random.randint(100000, 999999)
            return stem + str(round_str) + '.txt'
    return None
if __name__ == '__main__':
    # Demo run on the sample PDF; point this at r'../shell/resume.docx'
    # instead to try the Word-document path.
    demo_input = r'../shell/resume.pdf'
    Files2Txt(os.path.abspath(demo_input))
第四步,当然是面向女朋友(对象)啦,批处理操作:
# coding = utf-8
"""
遍历目录及文件,打印所有文件的完整路径
并批量抽取文本信息
"""
import os,time
import extractTxt
# 遍历目录处理子文件
class Traversal():
    """Walk a directory tree and hand every file in it to a callback.

    The sub-directory layout of the source tree is recreated under the save
    directory, and each file is passed to the callback together with the
    save directory that mirrors its location.
    """

    def __init__(self, rootDir, func=None, saveDir=''):
        """Store the traversal settings.

        Args:
            rootDir: Directory whose files are processed recursively.
            func: Callable invoked as ``func(source_path, save_dir)`` with
                absolute paths. When ``None``, file paths are only printed.
            saveDir: Root directory for the output. When empty, a sibling of
                ``rootDir`` named ``news_<rootDir name>`` is used.
        """
        self.rootDir = rootDir
        self.saveDir = saveDir
        self.func = func

    def travelfile(self):
        """Create the save directory and process every file under ``rootDir``."""
        if self.saveDir == "":
            # abspath drops a trailing separator, so "../shell/" yields the
            # name "shell" instead of an empty string.
            parent, name = os.path.split(os.path.abspath(self.rootDir))
            save_dir = os.path.join(parent, 'news_' + name)
        else:
            save_dir = os.path.abspath(self.saveDir)
        # makedirs also creates missing parents and tolerates an existing dir.
        os.makedirs(save_dir, exist_ok=True)
        # Pass the save directory on; otherwise output lands in the current
        # working directory.
        self.allfiles(self.rootDir, save_dir)

    def allfiles(self, rootDir, save_dir=''):
        """Recursively process ``rootDir``, mirroring sub-directories.

        Args:
            rootDir: Directory to list.
            save_dir: Output directory matching ``rootDir``. An empty string
                resolves to the current working directory.
        """
        # Sorted so the processing order is reproducible between runs.
        for entry in sorted(os.listdir(rootDir)):
            path = os.path.join(rootDir, entry)
            if os.path.isfile(path):
                source = os.path.abspath(path)
                if self.func is None:
                    # No callback configured: just report the file.
                    print(source)
                else:
                    self.func(source, os.path.abspath(save_dir))
            elif os.path.isdir(path):
                new_save_dir = os.path.join(save_dir, entry)
                os.makedirs(new_save_dir, exist_ok=True)
                self.allfiles(path, new_save_dir)
if __name__ == '__main__':
    rootdir = r"../shell/"
    start_time = time.time()
    # The batch conversion itself is left disabled, as in the original script.
    # To enable it, uncomment the two lines below; extractTxt is the module
    # that holds Files2Txt from step three, and the callback is the second
    # argument of Traversal.
    # trans = Traversal(rootdir, extractTxt.Files2Txt)
    # trans.travelfile()
    # Take the end timestamp only after the work, so the printed duration
    # covers the conversion once it is enabled.
    end_time = time.time()
    print("cost_total_time", (end_time - start_time), 's')