这个识别还是很准的,可以直接调用。下次有时间再写一个调用接口的示例。
'''
百度图片文字识别接口封装
'''
import base64
from .httplib import HEADERS, retry_get, retry_post
class GeneralOcr(object):
    """Thin client for Baidu Cloud's basic general text-recognition (OCR) API.

    An instance holds the API credentials and an access token. If no
    ``access_token`` is supplied, one is requested via :meth:`oauth` when the
    instance is created.
    """

    # NOTE(review): default credentials are hard-coded in source. They are
    # kept so existing no-argument callers keep working, but they should be
    # rotated and loaded from configuration or the environment instead.
    __client_id = 'z9ILc5DopWA5rm4NuAou64GY'
    __client_secret = 'fAHDaKibDPPN8G80qTTZXxjcBA6yHYUs'

    def __init__(self, client_id=None, client_secret=None, access_token=None):
        """Store credentials and obtain an access token if none is given.

        Args:
            client_id: API key; falls back to the class default when falsy.
            client_secret: API secret; falls back to the class default when
                falsy.
            access_token: Pre-fetched token. When falsy, a token is requested
                with the effective credentials of this instance.
        """
        self.client_id = client_id or self.__client_id
        # Previously this read `client_id`, silently replacing the secret
        # with the id whenever a custom id was supplied.
        self.client_secret = client_secret or self.__client_secret
        # Request the token with this instance's credentials rather than
        # always falling back to the class defaults.
        self.access_token = access_token or self.oauth(
            self.client_id, self.client_secret
        )

    @classmethod
    def oauth(cls, client_id=None, client_secret=None):
        """Request an access token using the client-credentials grant.

        Args:
            client_id: API key; defaults to the class-level key.
            client_secret: API secret; defaults to the class-level secret.

        Returns:
            The value of ``access_token`` in the JSON response.

        Raises:
            KeyError: If the response JSON has no ``access_token`` entry.
        """
        client_id = client_id or cls.__client_id
        client_secret = client_secret or cls.__client_secret
        oauth_url = f'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={client_id}&client_secret={client_secret}'
        oauth_data = retry_get(oauth_url).json()
        return oauth_data['access_token']

    def basic_general(self, image):
        """Send raw image bytes to the ``general_basic`` OCR endpoint.

        Args:
            image: Image content as bytes; it is base64-encoded before sending.

        Returns:
            The decoded JSON response, or ``{'words_result': '请求错误'}``
            when the request yields a falsy response.
        """
        general = f"https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token={self.access_token}"
        data = {'image': base64.b64encode(image)}
        # Work on a copy so the shared module-level HEADERS mapping is not
        # mutated for every other caller that imports it.
        headers = dict(HEADERS)
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        resp = retry_post(general, data=data, headers=headers)
        return resp.json() if resp else ({'words_result': '请求错误'})

    def basic_ocr(self, addr, sep='\n', certain=False):
        """Recognize the text in an image given by URL or local file path.

        Args:
            addr: An address starting with ``http`` is downloaded; anything
                else is opened as a local file in binary mode.
            sep: String appended after every recognized line (so the result
                ends with ``sep`` when at least one line was recognized).
            certain: Set to True when the link is known to be an image, which
                skips the ``png``/``jpg`` suffix check.

        Returns:
            The recognized lines, each followed by ``sep``. An empty string is
            returned when the response carries no list of results (for
            example the request-failure placeholder from ``basic_general``).

        Raises:
            Exception: If ``addr`` is a link without a ``png``/``jpg`` suffix
                and ``certain`` is False.
        """
        if addr.startswith('http'):
            if not (certain or addr.endswith(('png', 'jpg'))):
                raise Exception("The link doesn't have a suffix, make sure it's a image.")
            resp = retry_get(addr)
            # A falsy response is sent on as empty image data (best effort).
            image = resp.content if resp else b''
        else:
            with open(addr, 'rb') as fp:
                image = fp.read()

        results = self.basic_general(image).get('words_result', tuple())
        # On request failure `words_result` is a plain string placeholder;
        # iterating it would call `.get` on single characters and crash.
        if not isinstance(results, (list, tuple)):
            return ''
        return ''.join(item.get('words', '') + sep for item in results)
使用时,只需引入这个类即可:
# Usage example: `src` is the link (URL) of the image to recognize.
from renderer.utils import GeneralOcr
# No arguments are passed, so the class falls back to its built-in credentials
# and requests an access token during construction.
ocr = GeneralOcr()
# NOTE(review): `response` is not defined in this snippet — presumably a
# crawler response object whose urljoin() resolves `src` to an absolute URL; confirm.
annex_url = response.urljoin(src)
# certain=True skips the png/jpg suffix check; sep='' joins the recognized
# lines with no separator. The recognized text is stored on `response.string`.
response.string = ocr.basic_ocr(annex_url, sep='', certain=True)
# One-line form: ocr = GeneralOcr().basic_ocr('<image URL>', sep='', certain=True)
这里就可以直接打印结果,拿到自己想要的内容。这个方法已经封装好,可以直接调用。