环境:Python 2.7
# coding=utf-8
import PyPDF2
def read_pdf_test1(file_path):
    """Extract the text of every page of a PDF with PyPDF2 and print it.

    The file is opened in binary mode. If the reader reports the document
    as encrypted, decryption with an empty password is attempted. The text
    of all pages is concatenated in page order, then every newline, space
    and carriage-return character is removed before the result is printed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned, concatenated text (the same string that is printed).
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)
        if reader.isEncrypted:
            # Only the empty password is tried here.
            reader.decrypt('')
        page_count = reader.getNumPages()
        # Pages must be read while the file is still open. Collect the
        # per-page text and join once instead of growing a string with +=.
        page_texts = [reader.getPage(index).extractText()
                      for index in range(page_count)]
    contents = ''.join(page_texts)
    contents = contents.replace('\n', '').replace(' ', '').replace('\r', '')
    print("contents = {}".format(contents))
    return contents
def test2(file_path):
    """Extract the text of every page of a PDF with pdfplumber and print it.

    A header line with the page number is printed for each page while it is
    processed; the concatenated text of all pages is printed at the end.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages (the same string that is printed).
    """
    import pdfplumber  # pdfplumber==0.5.13

    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        # len(pdf.pages) gives the page count if it is needed.
        for page in pdf.pages:
            print('---------- 第[%d]页 ----------' % page.page_number)
            # Get all text on the current page, including text inside
            # tables. extract_text() may return None for a page without
            # any text, so fall back to an empty string.
            page_texts.append(page.extract_text() or '')
    contents = ''.join(page_texts)
    print("contents = {}".format(contents))
    return contents
if __name__ == "__main__":
    # Script entry point: run the pdfplumber-based extractor on a sample PDF.
    # Other sample files used while testing:
    #   './dlp_dengxian.pdf'
    #   './dlp_yuanyue.pdf'
    file_path = './dlp_yuanyue2.pdf'
    # read_pdf_test1(file_path)  # PyPDF2-based variant
    test2(file_path)
该博客展示了如何使用PyPDF2和pdfplumber库从PDF文件中提取文本。首先通过PyPDF2读取并解密PDF,然后遍历每一页提取文本。接着,使用pdfplumber打开文件,逐页提取文本并打印。内容涉及PDF文件加密处理和文本清理。
22万+

被折叠的 条评论
为什么被折叠?



