python读取pdf

记录一次使用python读取pdf的经历

pdf格式如下图所示:
该文件包含了一些论文信息:题目和作者
现在需要将每篇论文的题目和作者提取出来,写入到excel中。
处理程序如下,虽然有一些错误,但是可以作为一种参考方法。

在这里插入图片描述

处理方法

import pdfplumber
import os
import openpyxl as xl

def process(words, base_top, tgt, line_gap=14):
    """Group the words of one page into papers and print each title/author.

    Words are first bucketed into text lines by the integer part of their
    vertical position, then consecutive lines whose positions differ by
    exactly ``line_gap`` are merged into one paper entry, which is handed to
    ``get_name_and_author``.

    Args:
        words: iterable of dicts with at least the keys ``'top'`` (numeric
            vertical position) and ``'text'`` (the token string).
        base_top: offset added to every ``'top'`` value so that positions
            from different pages do not collide.
        tgt: output target. Currently unused: the Excel write is disabled
            and results are only printed.
        line_gap: vertical distance between two consecutive lines of the
            same paper. Defaults to 14, the value originally hard-coded.

    Returns:
        None. Title, author and a separator line are printed for every
        paper for which a title could be determined.
    """
    # Bucket tokens by line; dict insertion order keeps the lines in the
    # order in which their first token was seen.
    lines_by_top = {}
    for word in words:
        # Only the integer part of the position identifies a line.
        top = str(word['top'] + base_top).split('.')[0]
        text = word['text']
        # Skip tokens that are purely digits once surrounding colons are
        # stripped.
        if text.strip(':').isdigit():
            continue
        text = text.replace('-\xad‐', '-')
        lines_by_top.setdefault(top, []).append(text)

    # Each entry looks like (['xiao', 'huo', 'zi'], '100').
    paper_info = [(tokens, top) for top, tokens in lines_by_top.items()]
    paper_info_len = len(paper_info)

    i = 0
    # A trailing single line can never form a paper, so the last line is
    # never used as the start of a new entry.
    while i < paper_info_len - 1:
        paper = [paper_info[i]]
        line_num = int(paper_info[i][1])
        j = i + 1
        try:
            # Lines exactly ``line_gap`` apart belong to the same paper.
            while (j < paper_info_len
                   and int(paper_info[j][1]) - line_num == line_gap):
                line_num = int(paper_info[j][1])
                paper.append(paper_info[j])
                j += 1
        except ValueError:
            # A position that cannot be parsed as an integer ends the
            # current paper; report the index of the offending line.
            print(j)

        # Decide which of the collected lines are title and which are author.
        paper_name, author = get_name_and_author(paper)
        if paper_name:
            # Excel output is disabled here; the row originally intended for
            # the sheet was ["AAAI-18", "", paper_name, author].
            print(paper_name)
            print(author)
            print('------------------')

        i = j

def get_name_and_author(paper):
    """Split the text lines of one paper entry into a title and an author.

    The input is not modified: the token lists inside ``paper`` are only
    read, never extended.

    Args:
        paper: sequence of ``(tokens, top)`` pairs, one per text line, where
            ``tokens`` is a list of strings and ``top`` is a value accepted
            by ``int()``.

    Returns:
        A ``(paper_name, author)`` tuple of space-joined strings, or
        ``(None, None)`` when the entry has fewer than two lines or when,
        for four or more lines, the first two lines share the same ``top``.
    """
    paper_len = len(paper)
    if paper_len < 2:
        return None, None

    # ``title_lines`` is the number of leading lines that form the title;
    # every remaining line belongs to the author.
    if paper_len == 2:
        title_lines = 1
    elif paper_len == 3:
        second_top = int(paper[1][1])
        third_top = int(paper[2][1])
        # second <= third: the second line is still part of the title.
        # NOTE(review): callers that pass lines in ascending ``top`` order
        # always take the two-line-title branch here.
        title_lines = 2 if second_top <= third_top else 1
    else:
        first_top = int(paper[0][1])
        second_top = int(paper[1][1])
        if first_top < second_top:
            # The second line is not part of the title.
            title_lines = 1
        elif first_top > second_top:
            title_lines = 2
        else:
            return None, None

    # Build fresh token lists instead of extending the caller's lists.
    name_tokens = [tok for line in paper[:title_lines] for tok in line[0]]
    author_tokens = [tok for line in paper[title_lines:] for tok in line[0]]
    return ' '.join(name_tokens), ' '.join(author_tokens)


if __name__ == '__main__':
    tgt = 'AAAI-15.xlsx'  # Excel file that receives the extracted data

    # Open the PDF to process; the context manager closes the file handle
    # even if processing fails part-way through.
    with pdfplumber.open("AAAI-15.pdf") as pdf:
        # Open the Excel workbook, creating it on disk first if it is missing.
        if os.path.exists(tgt):
            workbook = xl.load_workbook(tgt)
        else:
            workbook = xl.Workbook()
            workbook.save(tgt)
        sheet = workbook.active

        base_top = 0
        for page in pdf.pages:  # Walk through every page
            process(page.extract_words(), base_top, sheet)
            # ``top`` restarts on every page, so a running offset keeps the
            # positions of different pages apart. 710 was chosen from the
            # largest ``top`` value observed on a page.
            base_top += 710

    workbook.save(tgt)  # Save the Excel file







评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值