import os
import base64
import requests
import pandas as pd
class BaiduOCR(object):
    """Client for Baidu's VAT-invoice OCR endpoint.

    Requests an OAuth access token, submits PDF or image invoices for
    recognition, and flattens a recognition result into one row (a dict)
    that can be exported to an Excel sheet.
    """

    TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
    OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        # APIKey / SecretKey are sent as client_id / client_secret when a
        # token is requested; AppID is stored but not used by this class.
        self.AppID = ""
        self.APIKey = ""
        self.SecretKey = ""

    def getAccessToken(self):
        """Request an OAuth access token with the client-credentials grant.

        :return: the access token as a string.
        :raises RuntimeError: if the response contains no ``access_token``
            (previously the string ``"None"`` was returned and the failure
            only surfaced on the following OCR call).
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        response = requests.post(
            self.TOKEN_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )
        payload = response.json()
        token = payload.get("access_token")
        if not token:
            raise RuntimeError(
                "Baidu OAuth response contains no access_token: %r" % (payload,)
            )
        return str(token)

    def _recognize(self, accessToken, field, filePath):
        """Send the base64-encoded file in form field *field*; return the JSON."""
        # The context manager closes the file even if reading fails.
        with open(filePath, 'rb') as fh:
            encoded = base64.b64encode(fh.read())
        request_url = self.OCR_URL + "?access_token=" + accessToken
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        response = requests.post(
            request_url,
            data={field: encoded},
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def getContent(self, accessToken, pdfFile):
        """Run invoice OCR on a PDF file.

        :param accessToken: token obtained from :meth:`getAccessToken`.
        :param pdfFile: path of the PDF file to recognize.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, "pdf_file", pdfFile)

    def getContentPng(self, accessToken, pngFile):
        """Run invoice OCR on an image file.

        :param accessToken: token obtained from :meth:`getAccessToken`.
        :param pngFile: path of the image file to recognize.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, "image", pngFile)

    @staticmethod
    def _first_word(words_result, key):
        """Return the ``word`` of the first entry of list *key*, or '' if empty."""
        rows = words_result[key]
        return rows[0]['word'] if rows else ''

    def getUsefulInfo(self, content, fp_path):
        """Flatten an OCR response into a single row of invoice fields.

        Only the first line item of each commodity list is kept; an empty
        commodity list yields an empty string instead of an ``IndexError``.

        :param content: decoded OCR response; must contain ``words_result``.
        :param fp_path: path of the invoice file, stored in the row as-is.
        :return: dict mapping column title to value, in column order.
        :raises KeyError: if ``words_result`` or an expected field is missing.
        """
        try:
            words_result = content['words_result']
        except KeyError:
            # Include the response in the message so the cause is visible.
            raise KeyError(
                "OCR response has no 'words_result': %r" % (content,)
            ) from None
        first_word = self._first_word
        info = {'发票文件路径': fp_path,
                '发票号码': str(words_result['InvoiceNum']),
                '开票日期': words_result['InvoiceDate'],
                '货物名称': first_word(words_result, 'CommodityName'),
                '未税金额': first_word(words_result, 'CommodityAmount'),
                '货物税率': first_word(words_result, 'CommodityTaxRate'),
                '货物税额': first_word(words_result, 'CommodityTax'),
                '合计金额': words_result['TotalAmount'],
                '合计税额': words_result['TotalTax'],
                '价税合计(小写)': words_result['AmountInFiguers'],
                '价税合计(大写)': words_result['AmountInWords'],
                '销售方名称': words_result['SellerName'],
                '销售方纳税人识别号': words_result['SellerRegisterNum'],
                '销售方银行及账户': words_result['SellerBank'],
                '销售方地址及电话': words_result['SellerAddress']}
        return info
# Batch driver: OCR every PDF invoice in fp_base and write one row per
# invoice to an Excel workbook in save_path.
if __name__ == '__main__':
    ocr = BaiduOCR()
    fp_base = r"C:\代理手续费\发票"
    save_path = r'C:\代理手续费\发票识别数据'
    # Select by real file extension, case-insensitively, so "X.PDF" is
    # included and a file literally named "pdf" is not.
    pdfFilelist = [name for name in os.listdir(fp_base)
                   if os.path.splitext(name)[1].lower() == '.pdf']
    infolist = []
    if pdfFilelist:
        # getAccessToken() takes no per-file input, so request the token
        # once instead of once per invoice.
        access_token = ocr.getAccessToken()
        for pdfFile in pdfFilelist:
            fp_path = os.path.join(fp_base, pdfFile)
            print(fp_path)
            content = ocr.getContent(access_token, fp_path)
            info = ocr.getUsefulInfo(content, fp_path)
            infolist.append(info)
    # Create the output directory up front so the OCR results are not lost
    # to a missing-folder error when the workbook is written.
    os.makedirs(save_path, exist_ok=True)
    df = pd.DataFrame(infolist)
    with pd.ExcelWriter(os.path.join(save_path, '增值税发票信息统计.xlsx')) as writer:
        df.to_excel(writer, sheet_name='增值税发票信息', index=False)