一.数据格式
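其实 .ptts 文件就是一个二进制文件,只要按照规定的字段顺序和字节数依次读取即可,难度不大。数据格式(与下面代码的读取顺序一致)如下:
- 文件头:头部大小 header_size(4 字节)、格式码(8 字节)、说明(header_size − 54 字节)、编码类型(20 字节)、编码长度 code_length(2 字节)、数据类型(20 字节)、样本长度(4 字节)、页索引(4 字节)、笔画数(4 字节);
- 笔画数据:每个笔画先是采样点数(2 字节),随后是各采样点的 x、y 坐标(各 2 字节);
- 行数据:行数(2 字节);每一行依次为行笔画数(2 字节)、各笔画索引(各 2 字节)、该行字符数(2 字节);每个字符依次为标签编码(code_length 字节)、字符笔画数(2 字节)、各笔画索引(各 2 字节)。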
二.主要代码
def read_ptts_from_dir(ptts_dir):
    """Parse a single binary ``.ptts`` handwriting file.

    The file is read strictly sequentially: a header, then the stroke
    trajectories, then the per-line / per-character annotations.  All
    integers are unpacked as little-endian with fixed standard sizes, so
    the result does not depend on the platform's native ``long`` width
    (a native ``'l'`` is 8 bytes on 64-bit Linux/macOS and cannot be
    unpacked from the 4-byte header field).

    Args:
        ptts_dir: Path of the ``.ptts`` file to read (despite the name,
            this is a file path, not a directory).

    Returns:
        An 8-tuple ``(stroke_num, point_num, traj, line_num,
        line_char_nmu_index, char_stroke_num_index, char_stroke_index,
        tagcode)`` where

        * ``stroke_num`` is the number of strokes in the file,
        * ``point_num`` lists the number of sampled points per stroke,
        * ``traj`` is one flat list of ``[x, y]`` points covering all
          strokes in file order (use ``point_num`` to split it),
        * ``line_num`` is the number of text lines,
        * ``line_char_nmu_index`` lists the number of characters per line,
        * ``char_stroke_num_index`` lists the number of strokes per
          character,
        * ``char_stroke_index`` lists, per character, the stroke indices
          stored for that character,
        * ``tagcode`` lists the decoded label of every character.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file ends before a numeric field is complete.
        UnicodeDecodeError: If a character label is not valid GB18030.
    """
    # Local import keeps the function usable when pasted on its own.
    import struct

    # The header-size field (4) plus format code (8), code type (20),
    # code length (2) and data type (20) add up to 54 bytes; the code
    # treats whatever remains of ``header_size`` as the illustration text.
    fixed_header_bytes = 54

    with open(ptts_dir, 'rb') as f:

        def read_value(fmt):
            """Read and unpack exactly one value of struct format ``fmt``."""
            return struct.unpack(fmt, f.read(struct.calcsize(fmt)))[0]

        # ---- header -------------------------------------------------
        header_size = read_value('<i')
        f.read(8)                                  # format code (unused)
        f.read(header_size - fixed_header_bytes)   # illustration (unused)
        f.read(20)                                 # code type (unused)
        code_length = read_value('<h')             # bytes per character label
        f.read(20)                                 # data type (unused)
        read_value('<i')                           # sample length (unused)
        read_value('<i')                           # page index (unused)
        stroke_num = read_value('<i')

        # ---- stroke trajectories ------------------------------------
        traj = []
        point_num = []
        for _ in range(stroke_num):
            pointnum = read_value('<h')
            point_num.append(pointnum)
            for _ in range(pointnum):
                x = read_value('<H')
                y = read_value('<H')
                traj.append([x, y])

        # ---- line / character annotations ---------------------------
        line_num = read_value('<H')
        tagcode = []
        char_stroke_index = []
        line_char_nmu_index = []
        char_stroke_num_index = []
        for _ in range(line_num):
            # The stroke indices of the line are not returned, but they
            # must still be consumed to reach the character records.
            line_stroke_nmu = read_value('<H')
            for _ in range(line_stroke_nmu):
                read_value('<H')

            line_char_nmu = read_value('<H')
            line_char_nmu_index.append(line_char_nmu)
            for _ in range(line_char_nmu):
                # Labels are fixed-width and NUL-padded; strip the padding.
                tag_code = f.read(code_length).decode('gb18030')
                tagcode.append(tag_code.replace('\x00', ''))

                char_stroke_num = read_value('<H')
                char_stroke_num_index.append(char_stroke_num)
                char_stroke_index.append(
                    [read_value('<H') for _ in range(char_stroke_num)]
                )

    return (stroke_num, point_num, traj, line_num, line_char_nmu_index,
            char_stroke_num_index, char_stroke_index, tagcode)
三.结果
①ground truth
②单字
③文本
总结
解析过程挺简单的,只要注意读取的顺序就行。