from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
# Walk the pages of a hard-coded PDF with pdfminer and iterate over the layout
# objects of each page.
# NOTE(review): the indentation of this snippet has been lost and its final
# statement ("if h") is cut off, so the block is not runnable as shown; the
# comments below describe only the calls that are visible.
def parse():
# Open the PDF in binary mode; the path is hard-coded.
# NOTE(review): no close() for this handle is visible in this block.
fn = open('D:\\search\\1804.03567.pdf','rb') # Converting to Word works better: the "RELATED WORK" keyword cannot be found in the TXT output, but it can be found in the Word output
# Build the parser on top of the open file.
parser = PDFParser(fn)
# Create an empty document and link parser and document to each other.
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
# presumably the empty string is the document password -- confirm against the
# pdfminer version in use
doc.initialize("")
# Stop when the document reports that text extraction is not allowed.
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed
else:
# Set up the interpreter pipeline: resource manager, default layout
# parameters, an aggregator device, and the interpreter that feeds pages to
# that device.
resource = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(resource,laparams=laparams)
interpreter = PDFPageInterpreter(resource,device)
# Process each page of the document, then fetch the result from the device.
for page in doc.get_pages():
interpreter.process_page(page)
layout = device.get_result()
# Iterate over the items of the page layout.
for out in layout:
# NOTE(review): the condition below is truncated in the source; what it tests
# and what follows it cannot be determined from here.
if h
pdfminer:PDF 转 Word / TXT
最新推荐文章于 2025-06-14 12:39:53 发布
