# -*- coding: utf-8 -*-
# @Time : 2023/8/1 13:14
# @Author : Cocktail_py
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Extract the text of a PDF with pdfminer and print it one page at a time.
file_name = r'xx.pdf'  # path of the PDF file to read

# In-memory buffer that the TextConverter writes the extracted text into.
output_string = StringIO()

with open(file_name, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The loop must stay inside the `with` block: the parser still needs
        # the open file while the pages are being processed.
        for number, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            print("**********************************")
            print("{0}页的内容为".format(number + 1), output_string.getvalue())
            # Empty the buffer and rewind it so the next iteration prints
            # only that page's text instead of everything accumulated so far.
            output_string.truncate(0)
            output_string.seek(0)
    finally:
        # Release the converter even if a page fails to parse.
        device.close()
pdfminer读取PDF文本内容
最新推荐文章于 2024-10-24 17:21:05 发布
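本文介绍了如何使用 Python 的 pdfminer 库从 PDF 文件中逐页提取文本内容,包括创建解析器(PDFParser)、文档对象(PDFDocument)、资源管理器(PDFResourceManager)、文本转换设备(TextConverter)和页面解释器(PDFPageInterpreter),以及逐页处理并输出文本的过程。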
435

被折叠的 条评论
为什么被折叠?



