本篇博客介绍如何用Python编写爬虫,爬取武汉大学教务系统首页的验证码图片:
下面是简单的实现,分为两部分:
一、从网页上爬取图片;
二、把图片保存到本地;
下面是实现代码:
# coding:utf-8
import re
import urllib
import urllib2
# 从网页上爬取验证码图片
def scriper(src="http://210.42.121.134/", count=99):
    """Download the captcha image referenced by a page several times.

    Fetches the page at ``src``, decodes it as GBK, looks for an
    ``<img id="captcha-img" ...>`` tag and requests the image that tag
    points to ``count`` times. Each response is stored through ``saveImg``
    as ``1.jpg``, ``2.jpg``, ... in the current working directory.

    Args:
        src: URL of the page that embeds the captcha image.
        count: Number of images to download (the default of 99 matches
            the original ``range(1, 100)``).

    Returns:
        None. If the page contains no matching ``<img>`` tag a message is
        printed and nothing is downloaded.
    """
    # Python 2 module, in line with the urllib2 usage of this file;
    # imported locally so the block does not depend on the import list.
    import urlparse

    response = urllib2.urlopen(urllib2.Request(src))
    try:
        page = response.read().decode('gbk')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    matches = re.findall(
        '<img id="captcha-img" alt=".*?" src="(.*?)".*?', page, re.S)
    if not matches:
        # Without this guard the page itself would be saved as ".jpg" files.
        print("No captcha <img> tag found at " + src)
        return

    # When the tag occurs more than once the last occurrence wins.
    captcha_path = matches[-1]
    print(captcha_path)

    # urljoin copes with relative, root-relative and absolute src values,
    # where plain string concatenation would produce a malformed URL.
    captcha_url = urlparse.urljoin(src, captcha_path)

    # The same URL is requested repeatedly; presumably the server answers
    # with a new captcha each time -- TODO confirm against the site.
    for index in range(1, count + 1):
        saveImg(captcha_url, str(index) + ".jpg")
# 对爬取的图片进行保存
def saveImg(imageURL, fileName):
    """Download ``imageURL`` and write the raw response body to ``fileName``.

    Args:
        imageURL: URL to fetch with ``urllib.urlopen``.
        fileName: Path of the output file. It is opened in binary write
            mode, so an existing file with that name is overwritten.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    connection = urllib.urlopen(imageURL)
    try:
        data = connection.read()
    finally:
        # Close the network handle even if the read fails.
        connection.close()

    # The file is opened only after the download succeeded, and the
    # with-block closes it even if the write raises.
    with open(fileName, 'wb') as image_file:
        image_file.write(data)

    # Single-argument form: prints the same "<message> <fileName>" line as
    # the two-operand print statement did.
    print(u"保存图片为" + u" " + fileName)
# Start the scraper with its default arguments as soon as this file runs.
# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so
# importing this file triggers the downloads as well.
scriper()