tabula
需要有Java环境才能运行
安装
pip install tabula-py
import tabula
import os
# Extract the tables from every page of the PDF (pages='all') and print them.
# NOTE(review): recent tabula-py releases return a list of DataFrames here,
# not a single DataFrame — confirm against the installed version.
pdf_path = r"C:\Users\86159\Desktop\helloworld\mypdf.pdf"
df = tabula.read_pdf(pdf_path, pages='all')
print(df)
camelot
pip install "camelot-py[cv]"
安装不成功可以参考:
https://blog.csdn.net/huiyinimen/article/details/105031160
import camelot
# Read the tables from mypdf.pdf using Camelot's 'stream' flavor and print
# the resulting table list.
tables = camelot.read_pdf('mypdf.pdf', flavor='stream')
print(tables)
# Export the first table in the list as an HTML file.
# (Excel alternative: tables[0].df.to_excel('temp1.xlsx'))
first_table = tables[0]
first_table.df.to_html('temp.html')
pdfplumber
安装
pip install pdfplumber
import pdfplumber
import pandas as pd
with pdfplumber.open("mypdf.pdf") as pdf:
# 获取第一页
first_page = pdf.pages[0]
# 解析文本
text &