PubTabNet数据清洗

原创已于 2022-09-04 20:30:24 修改 · 1.1k 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#python #机器学习 #pytorch

于 2022-02-25 17:32:17 首次发布

表格识别专栏专栏收录该内容

9 篇文章

订阅专栏

这篇博客介绍了如何处理PubTabNet数据集中坐标标注不准确的问题。通过编写Python代码，计算两个边界框的重叠比例（交集面积除以较小框的面积，与标准的IoU即Intersection over Union略有不同），将存在重叠比例超过30%的边界框的样本标记为错误数据，并只把其余干净的数据写入新的文件。同时，对含有错误坐标的图片画出边界框后进行了保存，以便进一步分析和修正。

部署运行你感兴趣的模型镜像

一. 问题描述

PubTabNet中的数据不太干净，特别是坐标的标注，如下图：
在这里插入图片描述

二. 数据清洗

直接看代码

"""
把pubtabnet数据集中坐标标注错误的数据筛选除去
"""
import jsonlines
import os
import cv2


def iou(bbox1, bbox2):
    """
    Return the overlap ratio of two axis-aligned bounding boxes.

    Despite its name this is NOT the classic intersection-over-union: the
    intersection area is divided by the area of the *smaller* box, so a box
    that lies completely inside another one scores 1.0.

    Args:
        bbox1: Four values ``(x0, y0, x1, y1)``; each is passed through
            ``float()``, so numeric strings are accepted as well.
        bbox2: Same format as ``bbox1``.

    Returns:
        float: ``intersection_area / min(area1, area2)``, or ``0.0`` when the
        boxes do not overlap (boxes that merely touch count as no overlap).
    """
    x0_1, y0_1, x1_1, y1_1 = (float(v) for v in bbox1)
    x0_2, y0_2, x1_2, y1_2 = (float(v) for v in bbox2)

    # Width and height of the overlap rectangle; non-positive means disjoint.
    overlap_w = min(x1_1, x1_2) - max(x0_1, x0_2)
    overlap_h = min(y1_1, y1_2) - max(y0_1, y0_2)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0

    size_1 = (x1_1 - x0_1) * (y1_1 - y0_1)
    size_2 = (x1_2 - x0_2) * (y1_2 - y0_2)
    # Normalise by the smaller box rather than by the union.
    return (overlap_w * overlap_h) / min(size_1, size_2)


def get_flag(box_list, threshold=0.3):
    """
    Tell whether a list of boxes is free of heavily overlapping pairs.

    Every unordered pair of boxes is scored with ``iou``. As soon as one pair
    scores above ``threshold`` the score is printed and the check stops.

    Args:
        box_list: List of boxes, each in the format accepted by ``iou``.
        threshold: Largest overlap score that is still accepted. Defaults to
            0.3.

    Returns:
        bool: ``True`` when no pair scores above ``threshold`` (including the
        case of fewer than two boxes), ``False`` otherwise.
    """
    for i, first in enumerate(box_list[:-1]):
        for second in box_list[i + 1:]:
            ratio = iou(first, second)
            if ratio > threshold:
                print(ratio)
                return False
    return True

if __name__ == "__main__":
    # cv2.imwrite does not create missing directories; it just fails to write.
    os.makedirs("badcase", exist_ok=True)

    with jsonlines.open("PubTabNet_2.0.0_val.jsonl", "r") as f, \
            jsonlines.open("PubTabNet_2.0.0_val_new.jsonl", "w") as train_f:
        for data in f:
            filename = data["filename"]
            # Only cells that have tokens and a bbox take part in the check.
            box_list = [
                cell["bbox"]
                for cell in data["html"]["cells"]
                if cell["tokens"] and "bbox" in cell
            ]

            if get_flag(box_list):
                # Clean sample: copy the record unchanged to the new file.
                train_f.write(data)
                continue

            # Bad sample: save the image with its boxes drawn for inspection.
            # The image is only needed here, so it is loaded lazily.
            img_path = os.path.join("val", filename)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None for a missing or unreadable file.
                print(f"cannot read image, badcase not saved: {img_path}")
                continue
            for x1, y1, x2, y2 in box_list:
                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 1)
            cv2.imwrite(os.path.join("badcase", filename), img)