python将OCR识别的表格还原到excel中

最新推荐文章于 2025-09-09 15:01:58 发布

原创

最新推荐文章于 2025-09-09 15:01:58 发布 · 2.2k 阅读

11 ·

CC 4.0 BY-SA版权

文章标签：

#python

本文介绍如何使用Python将OCR识别的表格数据整理并存入Excel文件，提供了相关代码示例，包括处理单元格宽高调整的方法。

下面的脚本读取OCR输出的JSON识别结果，并使用openpyxl将其中的表格写入Excel，直接上代码：

# -*- coding: UTF-8 -*-
import pdfplumber
import pandas as pd
import time, json
import os, openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import Border, Side

# Input directory (Windows path). NOTE(review): presumably holds the OCR
# result files that tranform_table parses as JSON -- confirm against the caller.
file_path = r"D:\Date11\data_pdf_txt"
# Output directory (Windows path); presumably where the generated workbooks go.
save_path = r"D:\Date11\data_pdf_excel111"
# Create the output directory at import time; exist_ok avoids an error when
# it is already present.
os.makedirs(save_path, exist_ok=True)
# Build a border style
def set_border(t_border, b_border, l_border, r_border, t_color='000000', b_color='000000', l_color='000000', r_color='000000'):
    """Return an openpyxl ``Border`` assembled from per-edge styles and colors.

    Args:
        t_border, b_border, l_border, r_border: Line style for the top,
            bottom, left and right edge; each is passed unchanged to
            ``Side(border_style=...)``.
        t_color, b_color, l_color, r_color: Color for the matching edge,
            passed unchanged to ``Side(color=...)``. Defaults to ``'000000'``.

    Returns:
        A new ``Border`` whose four sides use the given styles and colors.
    """
    # One Side per edge, keyed by the Border keyword it is passed as.
    edges = {
        'top': Side(border_style=t_border, color=t_color),
        'bottom': Side(border_style=b_border, color=b_color),
        'left': Side(border_style=l_border, color=l_color),
        'right': Side(border_style=r_border, color=r_color),
    }
    return Border(**edges)
# Apply borders to a rectangular range of cells
def format_border(ws, start_row, end_row, start_col, end_col):
    """Give every cell in a rectangular range a medium border on all sides.

    The previous implementation indexed the sheet with ``ws[start_row:end_row]``
    and then sliced each row by column. That had two problems:

    * for a single-row range openpyxl returns one row (a tuple of cells)
      instead of a tuple of rows, so slicing the "row" failed with TypeError;
    * row indexing only spans columns up to ``ws.max_column``, so a range
      wider than the used area was truncated or raised IndexError.

    It also re-applied ``'medium'`` to the outer edges after already setting
    all four sides of every cell to ``'medium'``, which changed nothing.
    Iterating with explicit row/column bounds produces the same result for
    the ranges that used to work and handles the two cases above.

    Args:
        ws: openpyxl worksheet to modify in place.
        start_row: First row of the range (1-based, inclusive).
        end_row: Last row of the range (1-based, inclusive).
        start_col: First column of the range (1-based, inclusive).
        end_col: Last column of the range (1-based, inclusive).

    Returns:
        The same worksheet ``ws``, to allow chaining.
    """
    # All cells receive the same style, so build the border once.
    medium_border = set_border('medium', 'medium', 'medium', 'medium')
    # iter_rows with explicit bounds always yields a tuple of rows, even for
    # a single row, and covers exactly the requested columns.
    for row in ws.iter_rows(min_row=start_row, max_row=end_row,
                            min_col=start_col, max_col=end_col):
        for cell in row:
            cell.border = medium_border
    return ws

def tranform_table(file_name, hd_name):
    print(file_name)
    head_name = os.path.splitext(hd_name)[0]
    ele_dict = {1: "A", 2: "B", 3: "C", 4: "D", 5: "E", 6: "F", 7: "G", 8: "H", 9: "I", 10: "J", 11: "K", 12: "L"}
    wb = openpyxl.Workbook()

    a = 0
    with open(file_name, 'r', encoding='utf-8') as load_json:
        data_json = json.load(load_json)
        data = data_json["pages"]

        for pag in data:
            tables = pag["tables"]
            for table in tables:  # 遍历所有的表排序
                print(table["cells"])
                table_list = table["cells"].sort(key=lambda x: (x["start_row"], x["start_column"]))
            for table in tables:

                a += 1
                # ws = wb.active
                # ws.title =
                ws = wb.create_sheet('sheet{}'.format(a))
                last_column = []
                last_row = []
                start_row=[]
                start_column=[]
                # for cell in table["cells"]:
                #     print(cell)
                #     if cell["start_row"] == cell["end_row"] :
                #     if cell["start_column"] == cell["end_column"]:
                #
                #     # 调整行高
                #     ws.row_dimensions[1].height = 20
                #     ws.row_dimensions[2].height = 20
                #     # 调整列宽
                #     ws.column_dimensions['A'].width = 40.0
                for dan in table["cells"]:
                    # for dan in cells:

                    print(dan["start_row"])
                    print(dan["content"])
                    print(type(dan["start_row"]))

                    try:


                        if dan["start_row"] == dan["end_row"] and dan["start_column"] == dan["end_column"]:
                            last_column.append(dan["end_column"])
                            last_row.append(dan["end_row"])
                            ws.cell(row=dan["start_row"], column=dan["start_column"]).value = dan["content"]
                        elif dan["start_row"] == dan["end_row"] and dan["start_column"] != dan["end_column"]:
                            font = ele_dict[dan["start_column"]] + str(dan["sta