def _extract_tax_ids(text):
    """Return the ``(buyer, seller)`` taxpayer IDs found in *text*, or ``None``.

    Two layouts are tried in order:

    1. Labelled IDs, e.g. ``纳税人识别号: 913202006829704176``.
    2. Bare 18-character codes with no label in front of them.
    """
    # Layout 1: the first two labelled IDs are taken as buyer and seller.
    labelled = re.findall(r'纳税人识别号[::] *(\w+)', text)
    if len(labelled) >= 2:
        return labelled[0], labelled[1]

    # Layout 2: fall back to anything shaped like an 18-character code.
    bare = re.findall(
        r'[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}', text)
    if len(bare) >= 3:
        # With more than two candidates the first one is skipped, as in the
        # original logic (presumably another long number on the invoice that
        # happens to fit the pattern -- TODO confirm against real samples).
        # This branch must require three hits: indexing bare[2] with only
        # two hits used to raise IndexError.
        return bare[1], bare[2]
    if len(bare) == 2:
        return bare[0], bare[1]
    return None


def _parse_invoice_text(text):
    """Parse the extracted text of an invoice page into a dict of fields.

    Only fields that are actually found in *text* are present in the result.
    Labels may be followed by an ASCII or a full-width colon.
    """
    invoice = {}
    colon = r'[::] *'

    # Header fields above the invoice table: (label, value pattern, key).
    header_fields = (
        ('发票号码', r'\d+', "发票号码(FPHM)"),
        ('开票日期', r'.*', "开票日期(KPRQ)"),
        ('机器编号', r'\d+', "机器编号(JQBH)"),
        ('发票代码', r'\d+', "发票代码(FPDM)"),
        ('校验码', r'\d+', "校验码(JYM)"),
    )
    for label, value_pattern, key in header_fields:
        found = re.search(label + colon + '(' + value_pattern + ')', text)
        if found is not None:
            invoice[key] = found.group(1).replace(' ', '')

    # Net total: the first currency amount on the "合 计" line.  The
    # whitespace between the two characters is required, which keeps
    # "价税合计" from matching here.
    total_line = re.search(r'合\s+计(.*)', text)
    if total_line is not None:
        amount = re.search(r'[¥¥](-?\d[\d,]*(?:\.\d+)?)',
                           total_line.group(1).replace(' ', ''))
        # A total line without a recognisable amount is skipped instead of
        # crashing on a failed match.
        if amount is not None:
            invoice["合计(HJ)"] = amount.group(1)

    tax_ids = _extract_tax_ids(text)
    if tax_ids is not None:
        invoice["购方"], invoice["销方"] = tax_ids

    # Grand total including tax: whatever follows "小写" on its line, minus
    # the closing parenthesis and the currency sign (ASCII or full-width).
    grand_total = re.search(r'小写[))]?(.*)', text)
    if grand_total is not None:
        value = grand_total.group(1).replace(' ', '')
        invoice["价税合计(JSHJ)"] = value.lstrip('¥¥')

    # Line items: the remainder of every line after a "*category*" marker.
    details = re.findall(r'\*[\u4e00-\u9fa5]+\*(.*)', text)
    for index, detail in enumerate(details, start=1):
        invoice["项目(XM)-" + str(index)] = detail

    return invoice


def getPdfText2(path):
    """Extract invoice fields from the first page of the PDF at *path*.

    The page text is obtained through ``pdfplumber`` and then searched with
    regular expressions for the invoice number, issue date, machine number,
    invoice code, check code, net total, buyer/seller taxpayer IDs, grand
    total and the line items.  Both the raw text and the resulting dict are
    printed, as before; the dict is now also returned so callers can use it.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict mapping field names (e.g. ``"发票号码(FPHM)"``) to the extracted
        strings.  Fields that could not be found are omitted.
    """
    with pdfplumber.open(path) as pdf:
        # A document without pages, or a page that yields no text, is
        # treated as empty text rather than raising.
        text = (pdf.pages[0].extract_text() or '') if pdf.pages else ''
    print(text)
    invoice = _parse_invoice_text(text)
    print(invoice)
    return invoice
# Python 提取发票信息
# 最新推荐文章于 2025-03-23 16:38:05 发布