ICCV 2019 共有 1000 多篇论文,下面用 Python 提取出每篇论文的摘要。
先给论文改名:原文件名的格式是 作者_标题_ICCV_2019_paper.pdf,去掉开头的"作者_"和结尾的"_ICCV_2019_paper",只保留标题,从而缩短文件名。
import os
import glob
# Directory that holds the downloaded ICCV 2019 PDFs.
path = "F:\\BaiduNetdiskDownload\\ICCV2019"
# Marker that the proceedings append to every paper's file name.
post = "_ICCV_2019_paper"


def shorten_paper_name(base):
    """Return the title-only file name for an ICCV 2019 paper.

    ``base`` is expected to look like ``Author_Title_Words_ICCV_2019_paper.pdf``.
    The leading ``Author_`` token and the trailing ``_ICCV_2019_paper`` marker
    are removed, and the remaining underscores become spaces, giving
    ``Title Words.pdf``.

    A name without any underscore is returned unchanged, so running the
    script again on already-renamed files is harmless.
    """
    _, sep, rest = base.partition("_")
    if not sep:
        return base
    stem, ext = os.path.splitext(rest)
    # Strip the marker only where it is a true suffix; a plain str.replace
    # would also remove matching text from the middle of a title.
    if stem.endswith(post):
        stem = stem[:-len(post)]
    return stem.replace("_", " ") + ext


# Without recursive=True, "**" matches like "*": only PDFs directly in `path`.
files = glob.glob(path + "\\**.pdf")
for file in files:
    # basename() does not depend on how deep `path` is nested.
    base = os.path.basename(file)
    filename = os.path.join(path, shorten_paper_name(base))
    if filename == file:
        continue
    # Never rename onto an existing file: report the collision and move on.
    if os.path.exists(filename):
        print('skipped (target exists): ' + base)
        continue
    os.rename(file, filename)
print('done')
提取摘要(少部分论文的排版与其他论文不同,因此未能提取到有效摘要)
# -*- coding: utf-8 -*-
'''
ICCV 2019 论文摘要提取
'''
import glob
import sys
import importlib
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
# 以下 PDF 处理代码是从某篇 CSDN 博文中抄来的,没记住原文链接
def parse(pdffile):
    """Return the abstract text found on the first page of a PDF.

    The first page's horizontal text boxes are scanned in layout order. Once
    a box whose text starts with "Abstract" has been seen, the text of the
    next horizontal text box is returned as the abstract. Hyphenated line
    breaks ("-\\n") are joined and remaining newlines become spaces.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The abstract as a single-line string, or the marker string
        "-------- error --------" when no abstract was found on the first
        page (including when the document has no pages).

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction.
    """
    # The `with` block closes the handle on every exit path (return or
    # raise); all pdfminer work stays inside it because the parser reads
    # from `fp` while pages are being processed.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the layout device and the interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # Layout object (LTPage) for the page just processed.
            layout = device.get_result()
            found = False
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                abstract = str(results).replace("-\n", "").replace("\n", " ")
                if found:
                    # Box following the "Abstract" heading: the abstract body.
                    return abstract
                if abstract.startswith("Abstract"):
                    found = True
            # Only the first page is searched for the abstract.
            # NOTE(review): the source listing lost its indentation; stopping
            # after the first page follows the original inline comment.
            break
    # Reached when the first page has no abstract or there are no pages, so
    # callers always receive a string.
    return "-------- error --------"
if __name__ == '__main__':
    # Extract the abstract of every PDF in `path` into one UTF-8 text file.
    path = "F:\\BaiduNetdiskDownload\\ICCV2019"
    # Without recursive=True, "**" matches like "*": only PDFs directly in `path`.
    files = glob.glob(path + "\\**.pdf")
    with open("abstract_ICCV2019.txt", "w", encoding="utf-8") as fout:
        for pdf in files:
            try:
                abstract = parse(pdf)
            except PDFTextExtractionNotAllowed:
                # One protected PDF must not abort the whole batch.
                abstract = None
            if abstract is None:
                # Keep the output well-formed when nothing could be extracted.
                abstract = "-------- error --------"
            # File name without the directory part (Windows-style separators).
            basefile = pdf.split("\\")[-1]
            print(basefile)
            fout.write("Title: " + basefile + "\n")
            fout.write("Abstract: " + abstract + "\n\n")
    print("done!")