utils系列：提取office和pdf中的内容图片_# 定义保存excel的位置 pdf = pdfplumber.open(file)-CSDN博客

本文链接：https://blog.csdn.net/qq_38284951/article/details/117927186
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pptx
from pptx import Presentation
import time
import os, re
import docx
import shutil
from PIL import Image
import xlrd
import zipfile
import fitz
import sys
import importlib
import xlwt
import pdfplumber
# from win32com import client as wc
import win32com.client as win32
importlib.reload(sys)


def iter_shape(shape, text_shapes):
    """Collect every text-bearing shape reachable from ``shape``.

    Group shapes are descended into recursively; any other shape whose
    ``has_text_frame`` is true is appended to ``text_shapes``, in the order
    the shapes are visited.

    :param shape: a python-pptx shape, possibly a group shape
    :param text_shapes: list that receives the matching shapes (mutated in place)
    :return: None
    """
    # isinstance (instead of an exact type comparison) also recognises
    # subclasses of GroupShape.
    if isinstance(shape, pptx.shapes.group.GroupShape):
        for child in shape.shapes:
            iter_shape(child, text_shapes)
    elif shape.has_text_frame:
        text_shapes.append(shape)


def makedir(dir_path):
    """Create ``dir_path`` (and any missing parents) unless the path already exists.

    :param dir_path: directory to create
    :return: None
    """
    if os.path.exists(dir_path):
        # Anything already present at that path is left untouched.
        return
    os.makedirs(dir_path)

# Paragraph text of a pptx presentation
def pptx_paragraph_text(file):
    """Return the non-empty paragraph texts found in the text shapes of ``file``.

    Every shape of every slide is handed to ``iter_shape`` so that shapes
    nested inside groups are included as well.

    :param file: an opened presentation object exposing ``slides``
    :return: list of paragraph strings, empty paragraphs left out
    """
    text_shapes = []
    for slide in file.slides:
        for shape in slide.shapes:
            iter_shape(shape, text_shapes)

    value = []
    for text_shape in text_shapes:
        for paragraph in text_shape.text_frame.paragraphs:
            content = paragraph.text
            if content != "":
                value.append(content)
    return value


# Table text of a pptx presentation
def pptx_table_text(file):
    """Return the non-empty cell texts of every table shape in ``file``.

    Only top-level slide shapes whose ``has_table`` is true are inspected.

    :param file: an opened presentation object exposing ``slides``
    :return: list of cell strings, or ``["table content is None"]`` when no
        cell text was found
    """
    value = []
    for slide in file.slides:
        for shape in slide.shapes:
            if not shape.has_table:
                continue
            for cell in shape.table.iter_cells():
                if cell.text_frame and cell.text != "":
                    value.append(cell.text)
    # Placeholder entry so the caller always has something to write out.
    return value if value else ["table content is None"]


def pptx_file_translate(ppt_file, pathlist, filetype):
    """Extract paragraph text, table text and embedded images from a pptx file.

    Text is appended to the document's txt file through ``wirte_txt``. Every
    image found through the relationships of the presentation's parts is
    written to the image directory as
    ``<file name with "." replaced by "_">_<image file name>``. The elapsed
    time in seconds is appended to the txt file last.

    :param ppt_file: path of the presentation to read
    :param pathlist: ``[txt_dir, img_dir]`` output directories
    :param filetype: suffix tag forwarded to ``wirte_txt`` (e.g. ``"_ppt"``)
    :return: 0
    """
    txt_dir = pathlist[0]
    img_dir = pathlist[1]
    start = time.time()
    filename = os.path.basename(ppt_file).replace(".", "_")
    # Open the presentation
    file = Presentation(ppt_file)
    # Paragraph text
    para_text = pptx_paragraph_text(file)
    wirte_txt(para_text, ppt_file, txt_dir, filetype)
    # Table text
    table_text = pptx_table_text(file)
    wirte_txt(table_text, ppt_file, txt_dir, filetype)

    part_rels = file.part._rels
    for part_rid in part_rels:
        part_rel = part_rels[part_rid]
        # External relationships (e.g. hyperlinks) point at a URL rather than
        # a part inside the package, so there is nothing to descend into.
        if getattr(part_rel, "is_external", False):
            continue
        if not hasattr(part_rel.target_part, 'part'):
            continue
        rels = part_rel.target_part.part._rels
        for rid in rels:
            rel = rels[rid]
            if getattr(rel, "is_external", False):
                continue
            if "image" not in rel.target_ref:
                continue
            # basename copes with targets that contain no "/" at all, which
            # the former regex-based extraction could not.
            img_name = filename + "_" + os.path.basename(rel.target_ref)
            img_path = os.path.join(img_dir, img_name)
            with open(img_path, "wb") as f:
                f.write(rel.target_part.blob)

    end = time.time() - start
    wirte_txt(end, ppt_file, txt_dir, filetype)
    return 0

# Read a DOCX file
def read_word(word_path, pathlist, filetype):
    """
    Extract the paragraphs, tables and images of a Word (.docx) document.

    Non-empty paragraphs and one list per table row are appended to the
    document's txt file through ``wirte_txt``; images are written to the image
    directory as ``<document stem><filetype>_<image file name>``. The elapsed
    time in seconds is appended to the txt file last.

    :param word_path: path of the Word file
    :param pathlist: ``[txt_dir, img_dir]`` output directories
    :param filetype: suffix tag used in output names (e.g. ``"_doc"``)
    :return: 0
    """
    start = time.time()
    doc = docx.Document(word_path)
    txt_dir = pathlist[0]
    img_dir = pathlist[1]

    for paragraph in doc.paragraphs:
        if paragraph.text != "":
            wirte_txt(paragraph.text, word_path, txt_dir, filetype)

    for table in doc.tables:
        # Read the content of every row
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                text = cell.text.replace("\n", "")
                # Presumably meant to collapse merged cells that are reported
                # more than once; note it also drops genuinely repeated
                # values within a row.
                if text not in row_data:
                    row_data.append(text)
            wirte_txt(row_data, word_path, txt_dir, filetype)

    # os.path.basename handles whichever separators the platform accepts,
    # unlike a manual split on os.sep.
    new_name = os.path.basename(os.path.splitext(word_path)[0])
    rels = doc.part._rels
    for rid in rels:
        rel = rels[rid]
        # External relationships (e.g. hyperlinks whose URL happens to
        # contain "image") have no embedded part to save.
        if getattr(rel, "is_external", False):
            continue
        if "image" not in rel.target_ref:
            continue
        img_name = f'{new_name}{filetype}_{os.path.basename(rel.target_ref)}'
        with open(os.path.join(img_dir, img_name), "wb") as f:
            f.write(rel.target_part.blob)

    end = time.time() - start
    wirte_txt(end, word_path, txt_dir, filetype)
    return 0


def read_excel_content(file, pathlist, filetype):
    """Extract the cell values and embedded images of an Excel workbook.

    The non-empty cell values of each sheet are gathered row by row into
    ``{sheet name: [values]}``. The list of those dicts is appended to the
    workbook's txt file once, images are extracted through
    ``read_excel_image``, and the elapsed time in seconds is appended last.

    :param file: path of the workbook
    :param pathlist: ``[txt_dir, img_dir]`` output directories
    :param filetype: suffix tag used in output names (e.g. ``"_excel"``)
    :return: list with one ``{sheet name: [values]}`` dict per sheet
    """
    start = time.time()
    img_dir = pathlist[1]
    txt_dir = pathlist[0]
    # NOTE(review): whether .xlsx can be opened depends on the installed xlrd
    # release - confirm against the version in use.
    workbook = xlrd.open_workbook(file)

    result = []
    for sheet in workbook.sheets():
        cells = []
        for row_idx in range(sheet.nrows):
            cells.extend(v for v in sheet.row_values(row_idx) if v != '')
        result.append({sheet.name: cells})
    # Written once after all sheets are collected; writing inside the loop
    # repeated every earlier sheet for each later one.
    wirte_txt(result, file, txt_dir, filetype)

    zip_file = read_excel_image(file, img_dir, filetype)
    # read_excel_image may hand back a temporary archive to clean up.
    if zip_file and os.path.exists(zip_file):
        os.remove(zip_file)

    end = time.time() - start
    wirte_txt(end, file, txt_dir, filetype)
    return result

def read_excel_image(file, img_dir, filetype):
    """Save every picture embedded in an .xlsx workbook as a JPEG file.

    An .xlsx file is itself a zip archive, so it is opened in place instead of
    being copied to a sibling ``.zip`` first. That avoids reading from (and
    the caller later deleting) an unrelated archive that happens to carry the
    same name. Members below ``xl/media/`` are converted to RGB and saved as
    ``<workbook stem><filetype><n>.jpg``, with ``n`` counting the pictures
    that were saved, starting at 1.

    :param file: path of the workbook; anything that is not an existing
        ``.xlsx`` file (any letter case) is ignored
    :param img_dir: directory that receives the JPEG files
    :param filetype: suffix tag used in output names (e.g. ``"_excel"``)
    :return: always ``None`` - no temporary archive is created, so there is
        nothing for the caller to remove
    """
    if not (os.path.exists(file) and file.lower().endswith('.xlsx')):
        return None

    filename = os.path.splitext(os.path.basename(file))[0]
    number = 1
    try:
        # The context manager closes the archive even when reading fails.
        with zipfile.ZipFile(file) as archive:
            for member in archive.namelist():
                # Pictures are stored below this path; skip directory entries.
                if not member.startswith('xl/media/') or member.endswith('/'):
                    continue
                img_name = os.path.join(
                    img_dir, str(filename) + str(filetype) + str(number) + '.jpg')
                try:
                    with archive.open(member) as stream:
                        img = Image.open(stream)
                        img = img.convert("RGB")
                        img.save(img_name, "JPEG")
                except Exception as e:
                    # Best effort: one unreadable picture must not stop the
                    # extraction of the remaining ones.
                    print("Excel error is :{}".format(e))
                    continue
                number += 1
    except (zipfile.BadZipFile, OSError) as e:
        print("Excel error is :{}".format(e))
    return None

# Read the images of a pdf
def read_pdf_img(path, img_dir, filetype):
    '''
    Extract the images embedded in a pdf and save them as PNG files.

    Originally written against PyMuPDF 1.16.8 (pip install pymupdf==1.16.8);
    the method names of later releases are used when they are available.
    Every object whose definition matches ``/Subtype /Image`` is saved as
    ``<pdf stem><filetype>_img<n>.png``. Errors are printed, not raised.

    :param path: path of the pdf
    :param img_dir: directory that receives the images
    :param filetype: suffix tag used in output names (e.g. ``"_pdf"``)
    :return: 0
    '''
    # time.clock() was removed in Python 3.8 and its result was never used,
    # so the call has been dropped.
    # Regular expression that recognises image objects
    checkIM = r"/Subtype(?= */Image)"
    stem = os.path.splitext(os.path.basename(path))[0]
    doc = fitz.open(path)
    imgcount = 0
    try:
        # Prefer the current public names and fall back to the legacy ones
        # used by the original code.
        xref_length = getattr(doc, "xref_length", None) or doc._getXrefLength
        xref_object = getattr(doc, "xref_object", None) or doc._getXrefString
        for i in range(1, xref_length()):
            text = xref_object(i)
            if not re.search(checkIM, text):
                continue
            imgcount += 1
            pix = fitz.Pixmap(doc, i)
            new_name = stem + str(filetype) + "_img{}.png".format(imgcount)
            if pix.n >= 5:
                # Convert to RGB before writing a PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            save = getattr(pix, "save", None) or pix.writePNG
            save(os.path.join(img_dir, new_name))
            pix = None  # drop the pixmap before the next iteration
    except Exception as e:
        print("Exception is :{}".format(e))
    finally:
        doc.close()
    return 0

# Read the text, tables and images of a pdf
def read_pdf_content(file, pathlist, filetype):
    """Extract page text, tables and images from a pdf.

    Page text is appended to the document's txt file through ``wirte_txt``.
    All table rows are written to one sheet of ``<stem><filetype>.xls`` in the
    txt directory, images are saved through ``read_pdf_img``, and the elapsed
    time in seconds is appended to the txt file last. Text and table errors
    are printed, not raised.

    :param file: path of the pdf
    :param pathlist: ``[txt_dir, img_dir]`` output directories
    :param filetype: suffix tag used in output names (e.g. ``"_pdf"``)
    :return: 0
    """
    img_dir = pathlist[1]
    txt_dir = pathlist[0]
    start = time.time()
    # Same name wirte_txt derives and returns. Computing it here keeps the
    # table export working when no page produced a wirte_txt call.
    filename = os.path.splitext(file)[0] + filetype
    pdf = pdfplumber.open(file)
    try:
        try:
            for page in pdf.pages:
                text = page.extract_text()
                # A page without extractable text must not be written out as
                # the literal string "None".
                if text is None:
                    continue
                wirte_txt(text, file, txt_dir, filetype)
        except Exception as e:
            print("read_pdf_word :{}".format(e))
        # Export the tables to an Excel workbook
        try:
            workbook = xlwt.Workbook()
            sheet = workbook.add_sheet("sheet_table")
            row_idx = 0
            for page in pdf.pages:  # every page
                for table in page.extract_tables():  # every table on the page
                    for row in table:  # every row of the table
                        for col_idx, cell in enumerate(row):
                            sheet.write(row_idx, col_idx, cell)
                        row_idx += 1
            path = os.path.join(txt_dir, os.path.basename(filename) + ".xls")
            workbook.save(path)
        except Exception as e:
            print("read_pdf_table :{}".format(e))
    finally:
        # Close the pdf even when one of the steps above failed.
        pdf.close()
    read_pdf_img(file, img_dir, filetype)
    print('\n')
    print('写入成功!!!')
    end = time.time() - start
    wirte_txt(end, file, txt_dir, filetype)
    return 0


# Append to the txt output
def wirte_txt(dic, file_path, txt_dir, flietype):
    """Append ``str(dic)`` as one line to the txt file belonging to ``file_path``.

    The txt file lives in ``txt_dir`` and is named after the source file's
    base name without extension, followed by ``flietype`` and ``.txt``.

    :param dic: any object; its ``str()`` form is written
    :param file_path: path of the source document
    :param txt_dir: directory that holds the txt files
    :param flietype: suffix tag appended to the stem (e.g. ``"_pdf"``)
    :return: ``file_path`` without its extension, plus ``flietype``
    """
    stem, _extension = os.path.splitext(file_path)
    tagged = stem + flietype
    target = os.path.join(txt_dir, os.path.basename(tagged) + ".txt")
    with open(target, 'a', encoding='utf-8') as out:
        out.write(f"{dic!s}\n")
    return tagged

# def doc_to_docx(file):
#     word = wc.Dispatch("Word.Application") # 打开word应用程序
#     doc = word.Documents.Open(file) #打开word文件
#     doc.SaveAs("{}x".format(file), 12)#另存为后缀为".docx"的文件，其中参数12指docx文件
#     doc.Close() #关闭原来word文件
#     word.Quit()
#     print("完成！")
#     return "{}x".format(file)

# def save_as_xlsx(fname):
#     excel = win32.gencache.EnsureDispatch('Excel.Application')
#     wb = excel.Workbooks.Open(fname)
#     print(fname)
#     if fname.endswith(".xls"):
#         save_name = fname+"x"
#     elif fname.endswith(".xlsx"):
#         save_name = fname
#     wb.SaveAs(save_name, FileFormat = 51)
#     # os.remove(fname)
#     wb.Close()                               
#     excel.Application.Quit()
#     return save_name

def solve_path(path, paths=None):
    """Send ``path`` to the reader that matches its file extension.

    The extension is compared case-insensitively and must match exactly, so a
    name that merely ends in the letters "doc" or "ppt" is no longer treated
    as an Office file.

    :param path: file to process
    :param paths: optional ``[txt_dir, img_dir]`` output directories; when
        omitted, the module-level ``pathlist`` set up by the script entry
        point is used
    :return: None
    """
    ext = os.path.splitext(path)[1].lower()
    if ext not in ('.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls', '.pdf'):
        print("格式不支持")
        return

    # Resolved only after the extension check, so unsupported files can be
    # reported even when no output directories have been configured.
    out_dirs = pathlist if paths is None else paths

    # NOTE(review): legacy .doc / .ppt files are still forwarded as before;
    # per the author's closing note they may need converting first (see the
    # commented-out doc_to_docx / save_as_xlsx helpers).
    if ext in ('.docx', '.doc'):
        read_word(path, out_dirs, "_doc")
    elif ext in ('.pptx', '.ppt'):
        pptx_file_translate(path, out_dirs, "_ppt")
    elif ext in ('.xlsx', '.xls'):
        read_excel_content(path, out_dirs, "_excel")
    else:
        read_pdf_content(path, out_dirs, "_pdf")


if __name__ == '__main__':
    # Input (a single file or a directory tree) and output root.
    dir_path = r'xxx.xlsx'
    save_path = r"xxx"
    # dir_path = sys.argv[1]
    # save_path = sys.argv[2]
    txt_path = os.path.join(save_path, "txt")
    image_path = os.path.join(save_path, "img")
    for out_dir in (save_path, txt_path, image_path):
        makedir(out_dir)
    # solve_path reads this module-level list: [txt directory, image directory].
    pathlist = [txt_path, image_path]
    if not os.path.isfile(dir_path):
        # Not a regular file: walk it as a directory and process every file.
        for root, _subdirs, names in os.walk(dir_path):
            for name in names:
                path = os.path.join(root, name)
                print(path)
                solve_path(path)
    else:
        solve_path(dir_path)
注意：运行时可能会出现 .doc 和 .xls 格式不支持的情况，需要先转换为 .docx 和 .xlsx；转换函数（doc_to_docx、save_as_xlsx）都在上面的代码里（默认被注释掉）。