import os
import sys
import argparse

import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import torch

import src.utils as utils
import src.dataset as dataset
import crnn.seq2seq as crnn


def seq2seq_decode(encoder_out, decoder, decoder_input, decoder_hidden, max_length):
    """Greedily decode one word image with the attention decoder."""
    decoded_words = []
    alph = "ABCDEFGHIJKLMNOPQRSTUVWXYZŽŠŪ-'"
    converter = utils.ConvertBetweenStringAndLabel(alph)
    prob = 1.0
    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_out)
        probs = torch.exp(decoder_output)
        _, topi = decoder_output.data.topk(1)
        ni = topi.squeeze(1)
        decoder_input = ni
        prob *= probs[:, ni]
        if ni == utils.EOS_TOKEN:
            break
        decoded_words.append(converter.decode(ni))
    words = ''.join(decoded_words)
    prob = prob.item()
    return words, prob


def find_median(array_vals):
    array_vals.sort()
    mid = len(array_vals) // 2
    return array_vals[mid]


def detect_centerline(array_vals):
    """Return the median index of the brightest rows, i.e. the writing centerline."""
    max_val = max(array_vals)
    index_list = [index for index in range(len(array_vals)) if array_vals[index] == max_val]
    return find_median(index_list)


def rotate_image(image, angle):
    image_center = tuple(np.array(image.shape[1::-1]) / 2)
    rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
    return cv2.warpAffine(image, rot_mat, image.shape[1::-1], flags=cv2.INTER_LINEAR)


def extract_peak_ranges_from_array(array_vals, minimum_val=100, minimum_range=2):
    """Find (start, end) index ranges whose values stay above minimum_val."""
    start_i = None
    end_i = None
    peak_ranges = []
    for i, val in enumerate(array_vals):
        if val >= minimum_val and start_i is None:
            start_i = i
        elif val >= minimum_val and start_i is not None:
            pass
        elif val < minimum_val and start_i is not None:
            end_i = i
            if end_i - start_i > minimum_range:
                peak_ranges.append((start_i, end_i))
            start_i = None
            end_i = None
        elif val < minimum_val and start_i is None:
            pass
        else:
            raise ValueError("Cannot parse array values")
    return peak_ranges


parser = argparse.ArgumentParser()
parser.add_argument('--img_path', type=str, default='', help='path of the input image')
parser.add_argument('--rot_angle', type=int, default=0, help='global rotation angle of the image')
parser.add_argument('--padding', type=int, default=10, help='padding at the head of the image')
parser.add_argument('--block_size', type=int, default=33, help='block size for adaptive binarization, odd number only')
parser.add_argument('--threshold', type=int, default=32, help='constant subtracted from the local average when thresholding, even number only')
parser.add_argument('--vertical_minimum', type=int, default=800, help='minimal brightness of each VERTICAL line')
parser.add_argument('--word_minimum', type=int, default=200, help='minimal brightness of each WORD')
# argparse's type=bool treats any non-empty string (including 'False') as True,
# so this flag takes 1 (blur) or 0 (no blur) instead
parser.add_argument('--blur', type=int, default=0, help='apply blur to words? 1 = yes, 0 = no')
parser.add_argument('--pretrained', type=int, default=1, help='which pretrained model to use')
cfg = parser.parse_args()


def main():
    global_rot_angle = cfg.rot_angle
    global_padding = cfg.padding
    imagename = cfg.img_path

    if cfg.pretrained == 0:
        my_encoder = "./model/encoder_0.pth"
        my_decoder = "./model/decoder_0.pth"
    elif cfg.pretrained == 1:
        my_encoder = "./model/encoder_1.pth"
        my_decoder = "./model/decoder_1.pth"
    else:
        sys.exit("Unknown pretrained model!")

    print("Analyzing: " + imagename)
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZŽŠŪ-'"
    print("Using Möllendorff Alphabet List: " + alphabet + "\n")
    # len(alphabet) + SOS_TOKEN + EOS_TOKEN
    num_classes = len(alphabet) + 2
    transformer = dataset.ResizeNormalize(img_width=480, img_height=64)

    image_color = cv2.imread(imagename)
    image_shape = (image_color.shape[0], image_color.shape[1])
    image_binary = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
    # rotate so the vertical Manchu lines become horizontal rows
    image = cv2.rotate(image_binary, cv2.ROTATE_90_COUNTERCLOCKWISE)
    adaptive_threshold = cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        cfg.block_size, cfg.threshold)
    adaptive_threshold = rotate_image(adaptive_threshold, global_rot_angle)
    # value= must be passed by keyword: the 7th positional argument of
    # copyMakeBorder is dst, not the border value
    adaptive_threshold = cv2.copyMakeBorder(adaptive_threshold, 20, 20, 20, 20,
                                            cv2.BORDER_CONSTANT, value=0)
    adaptive_threshold = adaptive_threshold[10:adaptive_threshold.shape[0] - 10,
                                            10:adaptive_threshold.shape[1] - 10]
    image_blur = cv2.GaussianBlur(adaptive_threshold, (3, 3), cv2.BORDER_DEFAULT)
    cv2.imshow('Binary Image', cv2.rotate(adaptive_threshold, cv2.ROTATE_90_CLOCKWISE))
    cv2.waitKey(1)

    # build and load the seq2seq model once, instead of reloading it for every word
    encoder = crnn.Encoder(1, 1024)
    decoder = crnn.Decoder(1024, num_classes, dropout_p=0.0, max_length=121)  # no dropout during inference
    map_location = 'cpu'
    encoder.load_state_dict(torch.load(my_encoder, map_location=map_location))
    decoder.load_state_dict(torch.load(my_decoder, map_location=map_location))
    encoder.eval()
    decoder.eval()

    vertical_sum = np.sum(image_blur, axis=1)
    peak_ranges = extract_peak_ranges_from_array(
        vertical_sum, minimum_val=cfg.vertical_minimum, minimum_range=5)
    img_display = np.copy(adaptive_threshold)
    # peak_ranges.append((peak_ranges[-1][1], adaptive_threshold.shape[0]))
    peak_ranges.reverse()

    horizontal_peak_ranges2d = []
    for peak_range in peak_ranges:
        start_y = 0
        end_y = img_display.shape[1]
        image_x = image_blur[peak_range[0]:peak_range[1], start_y:end_y]
        horizontal_sum = np.sum(image_x, axis=0)
        # plt.plot(horizontal_sum, range(horizontal_sum.shape[0]))
        # plt.gca().invert_yaxis()
        # plt.show()
        horizontal_peak_ranges = extract_peak_ranges_from_array(
            horizontal_sum, minimum_val=cfg.word_minimum, minimum_range=5)
        horizontal_peak_ranges2d.append(horizontal_peak_ranges)
        for hor in horizontal_peak_ranges:
            cv2.rectangle(img_display, (hor[0], peak_range[0]),
                          (hor[1], peak_range[1]), 140, 1)
            word_piece = adaptive_threshold[peak_range[0]:peak_range[1], hor[0]:hor[1]]
            if cfg.blur:
                word_piece = cv2.GaussianBlur(word_piece, (1, 1), cv2.BORDER_DEFAULT)
            image_dimension = (word_piece.shape[0], word_piece.shape[1])
            # cv2.imshow('Words', word_piece)
            # print(word_piece.shape)
            if image_dimension[0] < 30 or image_dimension[1] < 20:
                continue  # skip fragments too small to be a word
            factor = 1
            image_resized = cv2.resize(
                word_piece,
                (int(image_dimension[1] * factor), int(image_dimension[0] * factor)),
                interpolation=cv2.INTER_AREA)
            hor_sum = np.sum(image_resized, axis=1)
            ctr_line = detect_centerline(hor_sum)
            image_dimension_new = (image_resized.shape[0], image_resized.shape[1])
            add_padding = max([ctr_line, image_dimension_new[0] - ctr_line])
            # cv2.imshow('current Image', image_resized)
            # cv2.waitKey(0)
            if image_dimension_new[1] <= 500:
                # pad symmetrically around the detected centerline
                padded = cv2.copyMakeBorder(
                    image_resized,
                    add_padding - ctr_line,
                    add_padding - image_dimension_new[0] + ctr_line,
                    0, 0, cv2.BORDER_CONSTANT, value=0)
            else:
                padded = image_resized
            factor = 64 / padded.shape[0]
            padded = cv2.resize(
                padded,
                (int(padded.shape[1] * factor), int(padded.shape[0] * factor)),
                interpolation=cv2.INTER_AREA)
            # pad to the full 480-pixel input width; the width is shape[1]
            # (the original subtracted shape[0], the height)
            padded = cv2.copyMakeBorder(
                padded, 0, 0, global_padding,
                max(0, 480 - global_padding - padded.shape[1]),
                cv2.BORDER_CONSTANT, value=0)
            padded = Image.fromarray(np.uint8(padded)).convert('L')
            padded = transformer(padded)
            padded = padded.view(1, *padded.size())

            encoder_out = encoder(padded)
            max_length = 121
            decoder_input = torch.zeros(1).long()
            decoder_hidden = decoder.initHidden(1)
            words, prob = seq2seq_decode(encoder_out, decoder, decoder_input,
                                         decoder_hidden, max_length)
            print(words + " ", end='')
        print("\n")
        cv2.destroyAllWindows()
        cv2.imshow('Current Line', cv2.rotate(img_display, cv2.ROTATE_90_CLOCKWISE))
        cv2.waitKey(1)

    input("Reading Completed, Press Any Key to Exit. Ambula Baniha.")

    # color = (0, 0, 255)
    # for i, peak_range in enumerate(peak_ranges):
    #     for horizontal_range in horizontal_peak_ranges2d[i]:
    #         x = peak_range[0]
    #         y = horizontal_range[0]
    #         w = peak_range[1]
    #         h = horizontal_range[1]
    #         patch = adaptive_threshold[x:w, y:h]
    #         cv2.rectangle(img_display, (y, x), (h, w), 255, 2)
    #         # print(cnt)
    #         # cv2.imwrite("/Users/zhuohuizhang/Downloads/ManchuOCR/Data/"+fontname+"/Result/"+'%d' % cnt + '.jpg', patch)
    #         cnt += 1
    # # cv2.imshow('Vertical Segmented Image', line_seg_blur)
    # cv2.waitKey(0)


if __name__ == "__main__":
    main()

Based on the readmanchu.exe packaged from the code above, create a new GUI Python script that runs readmanchu.exe with the following parameters: --img_path (required; used to select the image), --rot_angle (rotation angle, default 0, optional), --padding (padding at the head of the image, default 10, optional), --block_size (block size for binarizing the image, odd numbers only, default 33, optional), --threshold (radius for computing the local average during thresholding, even numbers only, default 32, optional), --vertical_minimum (minimal brightness of each vertical line, default 800, optional), --word_minimum (minimal brightness of each word, default 200, optional), --blur (apply a blur to the words? default False; this field must also be filled in, with '1' meaning True and '0' meaning False), --pretrained (which pretrained model to use, default 1, int). After the parameters are entered, clicking Confirm should display the text printed by readmanchu.exe and also write it to .\output\<selected image name>.txt in the current folder (creating the directory and file automatically). Clicking Confirm then invokes readmanchu.exe; for example, readmanchu.exe --img_path .\examples\001.png --blur False is equivalent to calling cmd in the background to run readmanchu.exe.
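A minimal sketch of such a launcher follows. The request does not mandate a toolkit, so tkinter and subprocess from the standard library are assumptions, as are the file name gui_readmanchu.py and the expectation that readmanchu.exe sits in the current working directory and accepts 1/0 for --blur as specified above. It collects the flags listed in the request, runs the exe, shows its output in a text box, and mirrors it to .\output\<image name>.txt:

# gui_readmanchu.py -- a tkinter front end that shells out to readmanchu.exe
# (sketch under the assumptions stated above, not a definitive implementation)
import os
import subprocess
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext

EXE = "readmanchu.exe"  # assumed to sit in the current working directory

root = tk.Tk()
root.title("readmanchu launcher")

img_path = tk.StringVar()

def pick_image():
    # --img_path is the one required argument: choose it with a file dialog
    path = filedialog.askopenfilename(
        title="Select image",
        filetypes=[("Images", "*.png *.jpg *.jpeg *.bmp *.tif"), ("All files", "*.*")])
    if path:
        img_path.set(path)

# (flag, description, default) for the remaining parameters
FIELDS = [
    ("--rot_angle", "rotation angle (default 0)", "0"),
    ("--padding", "padding at the head of the image (default 10)", "10"),
    ("--block_size", "binarization block size, odd only (default 33)", "33"),
    ("--threshold", "averaging radius for thresholding, even only (default 32)", "32"),
    ("--vertical_minimum", "minimal brightness per vertical line (default 800)", "800"),
    ("--word_minimum", "minimal brightness per word (default 200)", "200"),
    ("--blur", "blur words? 1 = True, 0 = False (must be filled in)", "0"),
    ("--pretrained", "pretrained model index (default 1)", "1"),
]

tk.Button(root, text="Select image (--img_path)", command=pick_image).grid(
    row=0, column=0, sticky="w")
tk.Label(root, textvariable=img_path).grid(row=0, column=1, sticky="w")

entries = {}
for row, (flag, description, default) in enumerate(FIELDS, start=1):
    tk.Label(root, text=f"{flag}  ({description})").grid(row=row, column=0, sticky="w")
    var = tk.StringVar(value=default)
    tk.Entry(root, textvariable=var, width=10).grid(row=row, column=1, sticky="w")
    entries[flag] = var

output_box = scrolledtext.ScrolledText(root, width=80, height=20)
output_box.grid(row=len(FIELDS) + 2, column=0, columnspan=2)

def run():
    if not img_path.get():
        messagebox.showerror("Error", "--img_path is required: please select an image.")
        return
    cmd = [EXE, "--img_path", img_path.get()]
    for flag, var in entries.items():
        cmd += [flag, var.get()]
    # Feed a newline on stdin so the exe's trailing input() prompt returns
    # instead of blocking; capture stdout/stderr as UTF-8 text.
    proc = subprocess.run(cmd, capture_output=True, input="\n",
                          encoding="utf-8", errors="replace")
    text = proc.stdout + proc.stderr
    output_box.delete("1.0", tk.END)
    output_box.insert(tk.END, text)
    # Mirror the output to .\output\<image name>.txt, creating the folder if needed.
    os.makedirs("output", exist_ok=True)
    name = os.path.splitext(os.path.basename(img_path.get()))[0]
    with open(os.path.join("output", name + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)

tk.Button(root, text="Confirm", command=run).grid(
    row=len(FIELDS) + 1, column=0, columnspan=2)

root.mainloop()

Run it with python gui_readmanchu.py from the folder containing readmanchu.exe; clicking Confirm builds the argument list and executes the exe exactly as the cmd example above does. Note that subprocess.run blocks the window until the exe exits, so for long documents moving the call to a worker thread would keep the GUI responsive.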