需求
输入为中文PDF格式的表格,包含名为key的列,根据Excel中key,value两列对应的值,输出PDF中的表格,并将key列的值替换为Value,格式为Excel。
解决方案
import pandas as pd
import pdfplumber
def dealPdfandExcel(pdfFilePath, excelFilePath, outPath):
#pdfFilePath为PDF输入路径
pdf = pdfplumber.open(pdfFilePath)
pages = pdf.pages
page = pages[0]
#str1 = page.extract_text()[14:106].split(',')
#str2 = page.extract_text()[106:163].split(',')
tables = page.extract_tables()
table = tables[0]
df1 = pd.DataFrame(table[:])
#excelFilePath为EXCEL输入路径
df2 = pd.read_excel(excelFilePath) # sheet_name不指定时默认返回全表数据
df2 = df2[1:]
#替换df1第0列的值
after = []
for i in df1[0]:
temp = 0
for zw, yw in zip(df2['key'], df2['value']):
if i == zw:
after.append(yw)
temp = 1
break
if temp == 0:
after.append(i)
df1[0] = pd.Series(after)
#str1 = "".join(str1)
#str2 = "".join(str2)
#outPath为输出的EXCEL路径
writer = pd.ExcelWriter(outPath)
df1.to_excel(writer, index=None, header=None, startrow=1)
ws = writer.sheets['Sheet1']
ws.write_string(0, 4, '标题')
#ws.write_string(1, 0, str1)
#ws.write_string(2, 0, str2)
writer.save()