import os

import docx
import fitz
# PyMuPDF (imported as ``fitz``) handles Chinese text reasonably well.
# Install it with: pip3 install PyMuPDF
pdf_document1 = "2.pdf"

# --- Extract text ---
# A Word document collects the extracted text, one paragraph per PDF page.
file = docx.Document()
# Open the PDF in a context manager so the file handle is released as soon
# as text extraction is finished, even if a page fails to load.
with fitz.open(pdf_document1) as doc:
    for current_page in range(doc.page_count):
        page = doc.load_page(current_page)
        pagetext = page.get_text("text")
        file.add_paragraph(pagetext)
# Write the Word file once, after every page has been appended.
file.save('3.docx')
# --- Extract images ---
# Every image referenced by a page is written to the "image" directory as
# "<page index>_<running image number>.png".
pdf_document2 = "2.pdf"
image_dir = "image"
# pix.save() does not create missing directories, so make sure it exists.
os.makedirs(image_dir, exist_ok=True)
count = 0
# Context manager closes the PDF once all images have been written.
with fitz.open(pdf_document2) as pdf_document:
    for current_page in range(pdf_document.page_count):
        for image in pdf_document.get_page_images(current_page):
            xref = image[0]  # first item of the image entry is its xref
            pix = fitz.Pixmap(pdf_document, xref)
            count = count + 1
            # PNG only supports GRAY and RGB. Subtract the alpha channel
            # before counting components: 4 or more colour components means
            # CMYK, which must be converted to RGB first.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # Save the (possibly converted) pixmap; os.path.join keeps the
            # path portable instead of hard-coding Windows separators.
            pix.save(os.path.join(image_dir, f"{current_page}_{count}.png"))
            pix = None  # drop the reference so the pixmap memory is freed