一. 问题描述
PubTabNet数据集中的标注数据不太干净,特别是单元格坐标(bbox)的标注存在错误,如下图:
二. 数据清洗
直接看代码
"""
Filter out the PubTabNet samples whose cell coordinate annotations are wrong.
"""
import os
from itertools import combinations

import cv2
import jsonlines
def iou(bbox1, bbox2):
    """
    Score how strongly two axis-aligned boxes overlap.

    Despite the name, this is not the classic intersection-over-union: the
    intersection area is divided by the area of the *smaller* box, so a box
    that lies completely inside another one scores 1.0.

    Args:
        bbox1: Four values (x0, y0, x1, y1), each convertible to float.
        bbox2: Four values (x0, y0, x1, y1), each convertible to float.

    Returns:
        0 when the boxes share no area, otherwise the intersection area
        divided by the area of the smaller box.
    """
    coords_a = list(map(float, bbox1))
    coords_b = list(map(float, bbox2))
    left_a, top_a, right_a, bottom_a = coords_a
    left_b, top_b, right_b, bottom_b = coords_b

    # Width and height of the shared rectangle; a non-positive extent on
    # either axis means the boxes merely touch or are disjoint.
    inter_w = min(right_a, right_b) - max(left_a, left_b)
    inter_h = min(bottom_a, bottom_b) - max(top_a, top_b)
    if inter_w <= 0 or inter_h <= 0:
        return 0

    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)

    # Normalise by the smaller box rather than by the union.
    return (inter_w * inter_h) / min(area_a, area_b)
def get_flag(box_list, threshold=0.3):
    """
    Tell whether a set of cell boxes is free of suspicious overlaps.

    Every unordered pair of boxes is scored with ``iou``; the first pair
    whose score exceeds ``threshold`` marks the sample as badly annotated.

    Args:
        box_list: Boxes to check, each given as (x0, y0, x1, y1).
        threshold: Largest overlap score that is still tolerated. Defaults
            to 0.3, the value that used to be hard-coded.

    Returns:
        True when no pair of boxes overlaps by more than ``threshold``
        (including when there are fewer than two boxes), otherwise False.
    """
    for box1, box2 in combinations(box_list, 2):
        overlap = iou(box1, box2)
        if overlap > threshold:
            # Print the offending score so flagged samples can be inspected.
            print(overlap)
            return False
    return True
if __name__ == "__main__":
    # cv2.imwrite does not create missing directories, so make sure the
    # folder that receives the flagged samples exists before the loop.
    os.makedirs("badcase", exist_ok=True)

    with jsonlines.open("PubTabNet_2.0.0_val.jsonl", "r") as f, \
            jsonlines.open("PubTabNet_2.0.0_val_new.jsonl", "w") as train_f:
        for data in f:
            filename = data["filename"]
            # cv2.imread returns None instead of raising when the image is
            # missing or unreadable; the annotation is still checked then,
            # only the visualisation is skipped.
            img = cv2.imread(os.path.join("val", filename))

            box_list = []
            for cell in data["html"]["cells"]:
                # Cells without tokens or without a bbox have nothing to check.
                if not cell["tokens"] or "bbox" not in cell:
                    continue
                box = cell["bbox"]
                box_list.append(box)
                if img is not None:
                    x1, y1, x2, y2 = box
                    # Draw each cell box in red for visual inspection.
                    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 1)

            if get_flag(box_list):
                # Clean sample: keep it in the filtered annotation file.
                train_f.write(data)
            elif img is not None:
                # Bad sample: dump the annotated image instead of keeping it.
                cv2.imwrite(os.path.join("badcase", filename), img)