from cnocr import CnOcr
# import numpy as np
import fnmatch,os
# from PIL import Image
import re
from pdf2image import convert_from_path
# Single shared OCR engine, built once when the module is loaded; img2txt
# reuses this instance for every page image it recognises.
ocr = CnOcr()
def pdf2img(PDF_file):
    """Render every page of a PDF to a PNG file under ``./img/``.

    Pages are saved in order as ``./img/page_1.png``, ``./img/page_2.png``,
    and so on. Files with the same names from an earlier run are overwritten.

    Args:
        PDF_file: Path of the PDF document to convert.

    Returns:
        A tuple ``(image_counter, PDF_file)``. ``image_counter`` is the number
        of pages written plus one (e.g. 26 for a 25-page PDF); ``PDF_file`` is
        the argument, passed back unchanged.
    """
    # The page images go into ./img/; create it up front so the first save
    # does not fail when the directory has not been made yet.
    os.makedirs("./img", exist_ok=True)

    # 200 is handed to pdf2image as the second positional argument (the
    # rendering resolution in DPI according to the pdf2image documentation).
    pages = convert_from_path(PDF_file, 200)

    page_number = 0
    for page_number, page in enumerate(pages, start=1):
        page.save("./img/page_" + str(page_number) + ".png", 'png')

    # Callers expect "one past the last page index", not the page count.
    return page_number + 1, PDF_file
# Characters deleted from every page of OCR output: ASCII letters, newlines,
# spaces, ASCII parentheses, full stops and the accented letters ü, é, à.
# Digits are NOT removed.
# NOTE(review): an earlier comment on this code said digits should be removed
# as well, but the pattern has only ever matched letters - confirm the intent.
_OCR_NOISE = re.compile(r'[a-zA-Z\n ().üéà]')


def _clean_text(text):
    """Return *text* with every character matched by ``_OCR_NOISE`` removed."""
    return _OCR_NOISE.sub('', text)


def img2txt(image_counter, PDF_file):
    """OCR the rendered page images and append the cleaned text to a file.

    Reads ``./img/page_1.png`` through ``./img/page_<image_counter - 1>.png``,
    recognises each one with the module-level ``ocr`` engine, strips unwanted
    characters and appends the result to ``./OCR/<pdf name>.txt``. The cleaned
    text of each page is also printed.

    Args:
        image_counter: One more than the number of page images to read, as
            returned by ``pdf2img``.
        PDF_file: Path of the source PDF; only used to derive the output
            file name through ``newfilename``.

    Returns:
        None.

    Raises:
        OSError: If the output file cannot be opened, which includes the case
            where ``newfilename`` returns an empty name for a non-PDF path.
    """
    page_total = image_counter - 1  # e.g. 26 - 1 = 25 pages
    outfile = os.path.join('./OCR/', newfilename(PDF_file))

    # Make sure the output directory exists before opening the file.
    os.makedirs('./OCR/', exist_ok=True)

    # "a" keeps appending to an existing file, as before. The explicit UTF-8
    # encoding stops Chinese text from failing under a narrower locale default,
    # and the context manager closes the file even if OCR raises midway.
    with open(outfile, "a", encoding="utf-8") as f:
        for i in range(1, page_total + 1):
            filename = "./img/page_" + str(i) + ".png"
            # Each OCR result is a mapping whose "text" entry holds one
            # recognised line; concatenate them without separators.
            text = "".join(ls["text"] for ls in ocr.ocr(filename))
            text = _clean_text(text)
            print(text)
            f.write(text)

    print(outfile + ' over\n')
def newfilename(filePath, outfile=''):
    """Derive the ``.txt`` output file name that corresponds to a PDF path.

    Args:
        filePath: Path of the input file. Only its final component is used;
            the directory part is discarded.
        outfile: Ignored. Kept so that existing callers passing it keep
            working.

    Returns:
        The file name with its ``.pdf`` extension (in any letter case)
        replaced by ``.txt``, or an empty string when the file does not have
        a ``.pdf`` extension.
    """
    basename = os.path.basename(filePath)
    stem, extension = os.path.splitext(basename)
    # Compare the real extension case-insensitively instead of globbing on the
    # name: this behaves the same on every platform and no longer treats a
    # name that merely ends in "PDF" (no dot) as a PDF file.
    if extension.lower() != '.pdf':
        return ""
    return stem + '.txt'
# Directory that holds the input PDFs.
filePath = './pdf/'
# Files to process in this run, e.g. ['H51001.pdf', 'H51002.pdf', ...].
# To process everything in the directory, use os.listdir(filePath) instead.
filelist = ['H21003.pdf']
for filename in filelist:
    # Step 1: render the PDF to page images; step 2: OCR those images to text.
    image_counter, PDF_file = pdf2img(os.path.join(filePath, filename))
    img2txt(image_counter, PDF_file)
# The cnocr package:
# https://gitee.com/cyahua/cnocr?_from=gitee_search
# pdf2image depends on poppler, which has to be set up separately on Windows:
# https://stackoverflow.com/questions/18381713/how-to-install-poppler-on-windows
# CnOCR text extraction
# First published on 2023-03-07 09:27:32