记录一次使用python读取pdf的经历
pdf格式如下图所示:
该文件包含了一些论文信息:题目和作者
现在需要将论文的题目和作者提取出来,写入到Excel中。
处理程序如下,虽然有一些错误,但是可以作为一种参考方法。
处理方法
import pdfplumber
import os
import openpyxl as xl
def process(words, base_top, tgt, line_gap=14):
    """Group a page's words into lines and entries, then print title/authors.

    Words are first merged into text lines keyed by the integer part of
    ``word['top'] + base_top``. Consecutive lines whose keys differ by
    exactly ``line_gap`` are then collected into one entry, which is handed
    to ``get_name_and_author``. Each entry that yields a title is printed.

    Args:
        words: Iterable of dicts with a numeric ``'top'`` and a string
            ``'text'`` entry (the caller passes ``page.extract_words()``).
        base_top: Offset added to every word's ``top`` so that lines from
            different pages do not share the same key.
        tgt: Worksheet supplied by the caller. It is currently NOT used:
            results are only printed, nothing is written to it.
        line_gap: Difference between the keys of two consecutive lines for
            them to count as part of the same entry. Defaults to 14, the
            spacing the original script was tuned for.

    Returns:
        None.
    """
    # One entry per text line: key -> ([words...], key). Dicts keep insertion
    # order, so lines stay in the order their first word was seen.
    lines = {}
    for word in words:
        # Keep only the integer part of the position, as a string key.
        top = str(word['top'] + base_top).split('.')[0]
        text = word['text']
        # Skip purely numeric tokens (optionally wrapped in ':');
        # presumably index/page numbers - verify against the PDF.
        if text.strip(':').isdigit():
            continue
        # Collapse hyphen + soft hyphen (\xad) + Unicode hyphen into '-'.
        text = text.replace('-\xad‐', '-')
        if top in lines:
            lines[top][0].append(text)
        else:
            lines[top] = ([text], top)

    # Each item looks like (['xiao', 'huo', 'zi'], '100').
    paper_info = list(lines.values())
    paper_info_len = len(paper_info)

    # A group starting on the very last line would contain a single line,
    # which get_name_and_author rejects, so the scan stops one line short.
    i = 0
    while i < paper_info_len - 1:
        line_num = int(paper_info[i][1])
        paper = [paper_info[i]]
        j = i + 1
        try:
            # Use the spacing between lines to decide whether the next line
            # still belongs to the same entry.
            while int(paper_info[j][1]) - line_num == line_gap:
                line_num = int(paper_info[j][1])
                paper.append(paper_info[j])
                j += 1
                if j == paper_info_len:
                    break
        except (ValueError, IndexError):
            # Report the index of the offending line and carry on with
            # whatever was collected so far.
            print(j)

        # Apply the position-based rules to split title from authors.
        paper_name, author = get_name_and_author(paper)
        if paper_name:
            # NOTE(review): writing to the Excel sheet is disabled here, so
            # the workbook saved by the caller receives no rows - confirm
            # the intended row layout before enabling it.
            print(paper_name)
            print(author)
            print('------------------')
        i = j
def get_name_and_author(paper):
    """Split the lines of one entry into a title string and an author string.

    The input is never modified, so calling this twice on the same entry
    returns the same result.

    Args:
        paper: Sequence of ``(words, top)`` items, one per text line, where
            ``words`` is a list of strings and ``top`` can be converted with
            ``int()``. ``process`` builds these with the line's vertical
            position as ``top``.

    Returns:
        ``(paper_name, author)`` as space-joined strings, or ``(None, None)``
        when the entry has fewer than two lines, or has four or more lines
        whose first two ``top`` values are equal.

    Raises:
        ValueError: If a ``top`` value that has to be compared cannot be
            converted with ``int()`` (entries of three or more lines).
    """
    line_count = len(paper)
    if line_count < 2:
        return None, None

    # NOTE(review): the values compared below are line positions, not line
    # lengths. process() only groups lines with increasing position, so from
    # there a 3-line entry always gets a 2-line title and a longer entry
    # always gets a 1-line title - this looks unintended; confirm.
    if line_count == 2:
        title_lines = 1
    elif line_count == 3:
        second_top = int(paper[1][1])
        third_top = int(paper[2][1])
        # second <= third: the second line is still part of the title.
        title_lines = 2 if second_top <= third_top else 1
    else:
        first_top = int(paper[0][1])
        second_top = int(paper[1][1])
        if first_top < second_top:
            # The second line is not part of the title.
            title_lines = 1
        elif first_top > second_top:
            title_lines = 2
        else:
            return None, None

    # Build fresh strings instead of extending the caller's word lists.
    paper_name = ' '.join(
        word for line in paper[:title_lines] for word in line[0]
    )
    author = ' '.join(
        word for line in paper[title_lines:] for word in line[0]
    )
    return paper_name, author
if __name__ == '__main__':
    tgt = 'AAAI-15.xlsx'  # Excel file that receives the extracted data

    # Open the PDF in a with-block so the file handle is always released.
    with pdfplumber.open("AAAI-15.pdf") as pdf:  # PDF to process
        # Open the Excel workbook, creating it on disk if it does not exist.
        if os.path.exists(tgt):
            workbook = xl.load_workbook(tgt)
        else:
            workbook = xl.Workbook()
            workbook.save(tgt)
        sheet = workbook.active

        base_top = 0
        for page in pdf.pages:  # walk through every page
            words = page.extract_words()
            process(words, base_top, sheet)
            # ``top`` restarts on every page, so a growing offset keeps the
            # positions distinct; 710 was picked from the largest ``top``
            # seen on a page of this PDF.
            base_top += 710

    workbook.save(tgt)  # save the Excel file