https://nlpr.ia.ac.cn/databases/handwriting/Home.html
CASIA在线和离线中文手写数据库
https://nlpr.ia.ac.cn/databases/handwriting/Offline_database.html
CASIA-HWDB2.0-2.2
离线文本数据库由孤立字符数据集的同一批书写者完成,每人抄写了五页给定文本。由于数据丢失,缺少一位书写者(编号 371)以及四页内容。每一页都存储在以书写者编号和页码命名的 *.dgrl 文件中。除灰度图像外,数据文件还包含文本行分割的真值标注(ground truth)和字符类别标签(GB 码)。(训练集和测试集的书写者互不重叠)
Dataset | #writers | #pages | #lines | #character/#class | #out-of-class sample |
---|---|---|---|---|---|
HWDB2.0 | 419 | 2,092 | 20,495 | 538,868/1,222 | 1,106 |
HWDB2.1 | 300 | 1,500 | 17,292 | 429,553/2,310 | 172 |
HWDB2.2 | 300 | 1,499 | 14,443 | 380,993/1,331 | 581 |
Total | 1,019 | 5,091 | 52,230 | 1,349,414/2,703 | 1,859 |
截图(使用https://nlpr.ia.ac.cn/databases/Download/DGRLView.exe查看)
使用准备
把原训练集和测试集的数据合并到一个文件夹里,再根据作者重新划分训练集和测试集,确保训练集中包含每一位作者的笔迹。此外还需要把 dgrl 文件转成 png。
使用 DGRLView 查看 002-P18.dgrl 时显示正常(不会出现上下行重叠)
使用代码导出 png 时,发生了重叠(每行的坐标和长宽使用的是文件里面解析的结果)
软件应该是进行了渲染的处理,比如正片叠底
这里使用简单的方法处理,记录每行添加进大图后的最下边的 y 值,这样虽然上下距离变远了但不影响使用.
007-P19
这里的白边切不掉
修改后的代码(参考https://blog.youkuaiyun.com/DaGongJiGuoMaLu09/article/details/107050519)
import os
import struct
from pathlib import Path
import cv2 as cv
import numpy as np
from tqdm import tqdm
# Trim the white border
def remove_white(image):
    """Crop the white margin surrounding the content of an image.

    A pixel counts as background when its bitwise inverse is zero, i.e. when
    it is pure white (255 for ``uint8`` images).

    Args:
        image: NumPy integer array holding the image, typically a 2-D
            ``uint8`` grayscale bitmap.

    Returns:
        A view of ``image`` limited to the smallest rectangle that contains
        every non-white pixel. When the image holds no non-white pixel (or is
        empty) there is nothing to crop to and ``image`` is returned unchanged.
    """
    # Same criterion as inverting the image and looking for non-zero sums.
    ink = np.bitwise_not(image) != 0

    # Indices of the rows / columns that contain at least one ink pixel.
    ink_rows = np.where(ink.any(axis=1))[0]
    ink_cols = np.where(ink.any(axis=0))[0]
    if ink_rows.size == 0 or ink_cols.size == 0:
        return image

    top, bottom = ink_rows[0], ink_rows[-1]
    left, right = ink_cols[0], ink_cols[-1]

    # ``bottom`` and ``right`` are inclusive indices, while slice ends are
    # exclusive, hence the +1 (otherwise the last ink row/column is lost).
    return image[top:bottom + 1, left:right + 1]
def _read_exact(f, size):
    """Read exactly ``size`` bytes from ``f``; raise ``ValueError`` on a short read."""
    data = f.read(size)
    if len(data) != size:
        raise ValueError(
            "truncated DGRL file: expected {} bytes, got {}".format(size, len(data))
        )
    return data


def _read_ints(f, count, signed=False):
    """Read ``count`` consecutive little-endian 32-bit integers from ``f``."""
    fmt = "<{}{}".format(count, "i" if signed else "I")
    return struct.unpack(fmt, _read_exact(f, 4 * count))


def _read_line_bitmap(f, code_length):
    """Read one text-line record from ``f`` and return ``(x, bitmap)``."""
    (char_num,) = _read_ints(f, 1)
    # The record stores ``code_length`` bytes per character as the label.
    # The label is not exported, so its bytes are only skipped here.
    _read_exact(f, code_length * char_num)
    # Position is read as signed so that an out-of-page offset shows up as a
    # negative number instead of a huge positive one.
    _y, x = _read_ints(f, 2, signed=True)
    h, w = _read_ints(f, 2)
    pixels = _read_exact(f, h * w)
    # ``copy()`` makes the array writable/owned instead of a read-only view
    # of the bytes object.
    bitmap = np.frombuffer(pixels, dtype=np.uint8).reshape(h, w).copy()
    return x, bitmap


def read_from_dgrl(
    dgrl,
    gray_output_dir="Y:/RawData/CASIA-HWDB2-png/gray/test",
    binary_output_dir="Y:/RawData/CASIA-HWDB2-png/binary/test",
):
    """Export one DGRL page as a grayscale PNG and an Otsu-binarised PNG.

    The text-line bitmaps stored in the file are trimmed with
    ``remove_white`` and stacked one below the other (each kept at its own
    horizontal offset), so lines never overlap. The stacked page is trimmed
    again and written as ``<name>.png`` into both output directories, where
    ``<name>`` is the DGRL file name without extension.

    Args:
        dgrl: Path of the ``*.dgrl`` file to convert.
        gray_output_dir: Directory receiving the grayscale PNG; created when
            missing.
        binary_output_dir: Directory receiving the binarised PNG; created
            when missing.

    Returns:
        None. A message is printed and nothing is written when the file does
        not exist or contains no ink.

    Raises:
        ValueError: If the file is truncated or its header is malformed.
    """
    if not os.path.exists(dgrl):
        print("DGRL not exists!")
        return

    filename, _ = os.path.splitext(os.path.basename(dgrl))

    placed = []  # (top, left, bitmap) for every non-blank line
    next_top = 0  # first free row below the lines stacked so far
    with open(dgrl, "rb") as f:
        # File header: 4-byte total size, then the rest of the header whose
        # last four bytes are the code length (2 bytes) and 2 more bytes.
        (header_size,) = _read_ints(f, 1)
        if header_size < 8:
            raise ValueError("invalid DGRL header size: {}".format(header_size))
        header = _read_exact(f, header_size - 4)
        (code_length,) = struct.unpack("<H", header[-4:-2])

        # Page record: height, width, number of lines. The canvas is sized
        # from the lines themselves, so only the line count is needed.
        _height, _width, line_num = _read_ints(f, 3)

        for _ in range(line_num):
            x, bitmap = _read_line_bitmap(f, code_length)
            # A line without any ink has nothing to crop to or to draw.
            if not (bitmap != 255).any():
                continue
            bitmap = remove_white(bitmap)
            if x < 0:
                x = abs(x)
            # Stack below the previous line rather than at the stored y: the
            # stored boxes may overlap vertically. The first line's y offset
            # is irrelevant because the top margin is trimmed afterwards.
            placed.append((next_top, x, bitmap))
            next_top += bitmap.shape[0]

    canvas_height = next_top
    canvas_width = max((left + bmp.shape[1] for _, left, bmp in placed), default=0)
    if canvas_height == 0 or canvas_width == 0:
        print("DGRL contains no text lines!")
        return

    # White canvas exactly large enough for every stacked line.
    big_image = np.full((canvas_height, canvas_width), 255, dtype=np.uint8)
    for top, left, bmp in placed:
        h, w = bmp.shape[0], bmp.shape[1]
        big_image[top:top + h, left:left + w] = bmp

    cropped_image = remove_white(big_image)
    if cropped_image.size == 0:
        print("DGRL contains no text lines!")
        return

    # cv.imwrite does not create missing directories; it just reports failure.
    os.makedirs(gray_output_dir, exist_ok=True)
    os.makedirs(binary_output_dir, exist_ok=True)

    cropped_image_file = os.path.join(gray_output_dir, filename + ".png")
    if not cv.imwrite(cropped_image_file, cropped_image):
        print("Failed to write " + cropped_image_file)

    # Binarise with Otsu's method and save next to the grayscale export.
    _, binary_image = cv.threshold(
        cropped_image, 0, 255, cv.THRESH_BINARY | cv.THRESH_OTSU
    )
    binary_image_file = os.path.join(binary_output_dir, filename + ".png")
    if not cv.imwrite(binary_image_file, binary_image):
        print("Failed to write " + binary_image_file)
if __name__ == "__main__":
    # Convert every DGRL page of the test split. Only regular ``*.dgrl``
    # files are passed on: sub-directories or stray files in the folder
    # cannot be parsed as DGRL. Sorting keeps the processing order stable.
    dgrl_paths = sorted(
        path
        for path in Path("Y:/RawData/CASIA-HWDB2/test").iterdir()
        if path.is_file() and path.suffix.lower() == ".dgrl"
    )
    for dgrl_path in tqdm(dgrl_paths):
        # tqdm.write prints the path without breaking up the progress bar.
        tqdm.write(str(dgrl_path))
        read_from_dgrl(dgrl_path)
提取结果(保存灰度图和二值图)