# -*- coding: utf-8 -*-
# @Time : 2023/8/1 13:14
# @Author : Cocktail_py
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Path of the PDF to read; 'xx.pdf' is a placeholder to be replaced.
file_name = r'xx.pdf'

# Text buffer handed to the converter. It is emptied after every page (see
# the end of the loop), so each print shows only the current page's text.
output_string = StringIO()

with open(file_name, 'rb') as in_file:
    # Build the pdfminer pipeline: parser -> document -> interpreter -> device.
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The loop stays inside the `with` block so in_file is still open while
    # the pages are being read. Page numbers are reported starting from 1.
    for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
        interpreter.process_page(page)
        print("**********************************")
        print("{0}页的内容为".format(page_number), output_string.getvalue())
        # Reset the buffer for the next page: truncate(0) discards the text
        # but leaves the stream position where it was, so seek(0) is needed
        # to make the next write start at the beginning again.
        output_string.truncate(0)
        output_string.seek(0)
# pdfminer读取PDF文本内容
# 最新推荐文章于 2024-10-24 17:21:05 发布