Windows系统
示例代码
import sys
import time
import urllib
import shutil
import pytesser3
import requests
from lxml import etree
# Mutable module-level state: 'gid' is the running number parse() puts into
# the file name of each captcha image it saves, then increments.
config = dict(gid=1)
def parse(s, html, idx):
    """Locate the captcha image in a survey page, download it and OCR it.

    Args:
        s: Session-like object; only ``s.get(url)`` is called on it.
        html: Markup of the survey page, searched for ``<img id="imgCode">``.
        idx: Unused; kept so existing callers keep working.

    Returns:
        dict: ``{'validateCode': text}``. ``text`` is the recognised captcha
        with all whitespace removed, or ``None`` when the page contains no
        captcha image.

    Side effects:
        Saves the image as ``img/0_<gid>.jpg`` and increments
        ``config['gid']``.
    """
    # Function-scope imports keep this block self-contained.
    import os
    from urllib.parse import urljoin

    page_url = "https://sojump.com/jq/16276361.aspx?from=timeline"

    result = {}
    tree = etree.HTML(html)
    valimgs = tree.xpath('//img[@id="imgCode"]/@src')
    valimg = valimgs[0] if valimgs else None

    validateCode = None
    if valimg:
        # open() below fails if the target directory does not exist yet.
        os.makedirs('img', exist_ok=True)
        fname = 'img/' + str(0) + '_' + str(config['gid']) + '.jpg'
        config['gid'] = config['gid'] + 1

        # Resolve the src attribute against the page URL. Plain string
        # concatenation would append it after the page's query string and
        # request the survey page again instead of the image.
        ri = s.get(urljoin(page_url, valimg))
        with open(fname, 'wb') as f:
            f.write(ri.content)

        # The module is imported as ``pytesser3``; the name ``pytesser``
        # is not defined in this script.
        validateCode = pytesser3.image_file_to_string(fname)
        # Drop every whitespace character (spaces, newlines, '\r', tabs).
        validateCode = ''.join(validateCode.split())

    result['validateCode'] = validateCode
    return result
# Fetch the survey page once; the same session object is handed to parse(),
# which uses it to download the captcha image.
s = requests.Session()
r = s.get('https://sojump.com/jq/16276361.aspx?from=timeline')

# NOTE(review): this loop has no exit condition and keeps re-parsing the same
# response, so every iteration downloads and OCRs a captcha again. Presumably
# the original post continued with a submit step here -- confirm before
# running unattended.
while True:
    res = parse(s, r.text, 0)
    print(res)
一:安装pytesser3
1.安装pytesser3
pip3 install pytesser3
2.安装PIL(前者的依赖)
pip install pillow
3.安装tesseract-ocr引擎(没有会识别的很慢)
http://101.96.10.43/internode.dl.sourceforge.net/project/tesseract-ocr-alt/tesseract-ocr-setup-3.02.02.exe(win)
http://blog.csdn.net/strugglerookie/article/details/71606540(linux:centos)
二:安装lxml
pip3 install lxml
**Linux系统**
示例代码:
import random
import urllib.parse
import urllib.request
from time import time, strftime, localtime

import pytesseract
import requests
from PIL import Image
def download(qid, header, i):
    """Request a captcha image for a survey and save it as ``<i>.gif``.

    Args:
        qid: Survey id as a string; it is concatenated into the query string.
        header: Mapping of HTTP request headers passed to the request.
        i: Integer used as the output file name (``'%d.gif' % i``).

    Returns:
        None. The image bytes are written to the current directory.
    """
    # The millisecond timestamp goes into the ``t`` query parameter.
    url = ('https://www.wjx.cn/AntiSpamImageGen.aspx?q=' + qid
           + '&t=' + str(int(time() * 1000)))
    req = urllib.request.Request(url, headers=header)
    # Context managers close the HTTP response and the file even on errors.
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
    with open('%d.gif' % (i), 'wb') as pic:
        pic.write(data)
def binarizing(img, threshold=30):
    """Binarise an image in place: pixels above ``threshold`` become 255, the rest 0.

    The original comment describes the input as a gray image and the purpose
    as treating low-gray-value pixels to remove noise.

    Args:
        img: Object exposing ``load()`` (pixel access indexed by ``[x, y]``)
            and ``size`` as ``(width, height)``.
        threshold: Values strictly greater than this are set to 255.
            Defaults to 30, the value previously hard-coded.

    Returns:
        The same ``img`` object, modified in place.
    """
    pixels = img.load()
    width, height = img.size
    for y in range(height):
        for x in range(width):
            pixels[x, y] = 255 if pixels[x, y] > threshold else 0
    return img
def depoint(img, white=245, min_neighbors=3):
    """Whiten interior pixels that are mostly surrounded by bright pixels.

    For every pixel that is not on the image border, the four direct
    neighbours (up, down, left, right) are inspected; when at least
    ``min_neighbors`` of them are brighter than ``white`` the pixel is set
    to 255. The image is updated in place while scanning row by row, so a
    pixel whitened earlier counts as bright for the pixels visited after it.

    Args:
        img: Object exposing ``load()`` (pixel access indexed by ``[x, y]``)
            and ``size`` as ``(width, height)``. The original comment
            describes it as a gray image.
        white: A neighbour counts as bright when its value is strictly
            greater than this. Defaults to 245, the previous constant.
        min_neighbors: Minimum number of bright neighbours needed to whiten
            a pixel. Defaults to 3 (the previous ``count > 2``).

    Returns:
        The same ``img`` object, modified in place.
    """
    pixels = img.load()
    width, height = img.size
    # Border rows/columns are skipped so neighbour lookups stay in range.
    for y in range(1, height - 1):
        for x in range(1, width - 1):
            neighbours = (
                pixels[x, y - 1],
                pixels[x, y + 1],
                pixels[x - 1, y],
                pixels[x + 1, y],
            )
            bright = sum(1 for value in neighbours if value > white)
            if bright >= min_neighbors:
                pixels[x, y] = 255
    return img
def shibie(img):
    """Recognise the text in a captcha image ("shibie" is pinyin for "recognise").

    The image is converted to mode ``'L'`` (the original comment calls this
    grayscale processing), mapped through a 256-entry lookup table that
    yields 0 for levels below 140 and 1 otherwise, and the result is passed
    to ``pytesseract.image_to_string``.

    Args:
        img: Image object providing ``convert``; the converted image must
            provide ``point``.

    Returns:
        str: The recognised text with surrounding whitespace stripped. The
        same text is also printed.
    """
    imgry = img.convert('L')
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]
    out = imgry.point(table, '1')
    # Run OCR once and reuse the result for both the print and the return.
    text = str(pytesseract.image_to_string(out)).strip()
    print(text)
    return text
1.安装pytesseract
pip3 install pytesseract
2.安装PIL
pip3 install pillow
3.安装tesseract-ocr引擎(pytesseract需要调用它),其余缺少的依赖按报错提示安装即可-_-||
本文介绍了如何利用Python中的pytesseract库配合PIL库来识别网页上的验证码图像,并通过lxml库解析网页内容。文章提供了Windows和Linux系统下详细的安装步骤及示例代码。
1376

被折叠的 条评论
为什么被折叠?



