Goal: build a master-slave distributed crawler that runs over a wide-area network, using the asp300 site as the example. The master crawls the task URLs; every time a slave finishes a task it sends a task request to the master, and the master assigns it the next one.
Principle:
The crawler splits into two parts, a master and its slaves. The number of slaves is unbounded; it is limited only by how much load the master can absorb. Normally one master is enough, but once it hits its performance ceiling it becomes the bottleneck that caps the throughput of the whole system, the classic "weakest plank in the bucket" effect. For example, if the master can dispatch 50 tasks per second and each slave completes 0.5 tasks per second, adding slaves beyond 100 gains nothing.
How to raise the master's efficiency is not explored in this article; readers are left to investigate it themselves.
Tools:
Language: Python (Python 2: the code relies on urllib2)
Site: ASP300.COM
Environment: Tencent Cloud student instance (1 GHz single-core CPU, 1 GB RAM)
Code examples:
Master code:
#coding=gb2312
import urllib2
import random
import socket

# Pool of User-Agent strings; one is picked at random per request so the
# crawler's traffic looks less uniform
user_agent = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
              "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
              "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
              "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
              "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"]
def iteration():
    # Probe list pages 1-99 and collect the ones that still exist
    url_list = []
    for i in range(1, 100):
        url = 'http://www.asp300.com/SoftList/27/27_%d.html' % i
        # Forge the client identity with a random User-Agent
        headers = {'User-Agent': user_agent[random.randint(0, 6)]}
        req = urllib2.Request(url=url, headers=headers)
        # Skip pages that have gone missing (404)
        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError, e:
            if hasattr(e, 'code') and e.code == 404:
                continue
        url_list.append(url)
    return url_list
def server(url_list):
    HOST = ''
    PORT = 21567
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # Create the listening socket
    tcpSerSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpSerSock.bind(ADDR)
    tcpSerSock.listen(5)
    while True:
        # Loop forever so the master keeps accepting connections from slaves
        print 'waiting for connection......'
        tcpCliSock, addr = tcpSerSock.accept()
        print '...connected from:', addr
        while True:
            data = tcpCliSock.recv(BUFSIZ)
            if not data:
                break
            # Hand one task URL to the slave
            # (pop() raises IndexError once the task list is exhausted)
            tcpCliSock.send('%s' % url_list.pop())
        tcpCliSock.close()
    tcpSerSock.close()

if __name__ == '__main__':
    url_list = iteration()
    server(url_list)
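One limitation worth noting: server() talks to a single slave at a time, since accept() only runs again after the current connection closes. The article leaves master-side efficiency to the reader, but as a starting point, here is a minimal sketch (my own addition, not part of the original system) that serves each slave on its own thread and shares the tasks through a thread-safe queue:

#coding=gb2312
import Queue
import SocketServer

task_queue = Queue.Queue()

class TaskHandler(SocketServer.BaseRequestHandler):
    # One instance per slave connection, each running on its own thread
    def handle(self):
        while True:
            data = self.request.recv(1024)
            if not data:
                break
            try:
                # Hand out the next task; reply with an empty string when done
                self.request.send(task_queue.get_nowait())
            except Queue.Empty:
                self.request.send('')
                break

if __name__ == '__main__':
    for url in iteration():  # iteration() as defined in the master code above
        task_queue.put(url)
    server = SocketServer.ThreadingTCPServer(('', 21567), TaskHandler)
    server.serve_forever()

Because Queue.Queue serializes access, two slaves can never be handed the same task even when their requests arrive simultaneously.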
Slave code:
#coding=gb2312
import urllib2
import cookielib
import re
from PIL import Image
from bs4 import BeautifulSoup
import socket
import time

class MyError(Exception):
    # Custom exception, raised on unrecoverable download errors
    pass

class IgnoError(Exception):
    # Custom exception, raised for pages that should simply be skipped (404)
    pass

class RedirectHandler(urllib2.HTTPRedirectHandler):
    # Override the 301/302 handlers: instead of following the redirect, hand
    # back the target URL, so opener.open() returns the redirect location itself
    def http_error_301(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl

    def http_error_302(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl
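# How this works (my note, not from the original article): urllib2 passes 3xx
# responses to http_error_301/http_error_302, and whatever those methods return
# becomes the return value of opener.open(). Illustration with a made-up CodeID:
#
#   opener = urllib2.build_opener(RedirectHandler)
#   real_url = opener.open('http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=123')
#   # real_url is the redirect target as a plain string, not a response object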
def Download(url, headers, num_retries=9):
    # Download the page the URL points to, retrying on 5xx errors and timeouts
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req, timeout=60)
        the_page = response.read()
        response.close()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return Download(url, headers, num_retries - 1)
        elif hasattr(e, 'code') and (e.code == 404):
            raise IgnoError
        else:
            print 'Download Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return Download(url, headers, num_retries - 1)
        raise socket.timeout
    return the_page
def resolve(html):
    # Extract the title, file size, summary, and screenshot URLs from the page
    image_url = []
    soup = BeautifulSoup(html, "lxml")
    title = unicode(soup.head.title.string)
    title = re.search(u'(.*?)_ASP300', title).groups()[0]
    size = soup.find('div', class_='box').find('div', class_='box_1').find('div', id='goodsInfo').find('div', class_='textInfo').ul.find_all('li')[2].dd
    size = unicode(size)
    # Parse the numeric value out of the "软件大小" (file size) field
    size = float(re.search(u'软件大小:([\d.]+)', size).groups()[0])
    summary_tag = soup.find('div', class_='s')
    summary_content = unicode(summary_tag).strip()
    summary_content = summary_content.split('<br/>')
    summary_content[0] = summary_content[0][15:]  # drop the leading '<div class="s">'
    del summary_content[len(summary_content) - 1]
    for a, b in enumerate(summary_content):
        # Drop blank-line entries
        if b == '\n':
            del summary_content[a]
    summary_cache = u''
    for c in summary_content:
        summary_cache += (c + u'<br/>')
    summary_content = summary_cache
    # Collect screenshot URLs; the elements of image_url are str, not unicode
    for i in summary_tag.p.find_all('img'):
        image_url.append('http://www.asp300.com' + i['src'])
    # title and summary_content are unicode, size is float, image_url holds str
    return title, size, summary_content, image_url
def download_image(name, url, headers, num_tries=9):
    # Download one screenshot, retrying on 5xx errors and timeouts
    req = urllib2.Request(url=url, headers=headers)
    try:
        f = urllib2.urlopen(req, timeout=60)
    except urllib2.URLError, e:
        if num_tries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_image(name, url, headers, num_tries - 1)
        else:
            print 'Image download error:', e.reason
            raise MyError
    except socket.timeout:
        if num_tries > 0:
            return download_image(name, url, headers, num_tries - 1)
        raise socket.timeout
    image = open(name, 'wb')
    image.write(f.read())
    f.close()
    image.close()
def screenshot(name, change, format):
    # Remove the watermark by cropping `change` pixels off the bottom edge
    im = Image.open(name)
    w, h = im.size
    box = (0, 0, w, h - change)
    region = im.crop(box)
    region.save(name, format)
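# Usage note (mine): the watermark sits in a fixed-height band at the bottom of
# each screenshot, so a fixed crop is enough. main() below calls it as:
#
#   screenshot('./image/image_cache0.jpg', 52, 'jpeg')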
def soft_url(url, headers, num_retries=9):
    # Resolve the real download address of the software
    id = int(re.search('SoftView_(.*?)\.html', url).groups()[0])
    url1 = 'http://www.asp300.com/2012dll/Down.jsp?CodeID=%d&id=1' % id
    # Step 1: fetch the download cookie for this item
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener1 = urllib2.build_opener(handler)
    req1 = urllib2.Request(url=url1, headers=headers)
    try:
        opener1.open(req1, timeout=60)
        print '%s: download cookie acquired' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL1 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    # Cookie acquired
    # Step 2: with that cookie, request the download page; RedirectHandler makes
    # opener2.open() return the redirect target, i.e. the real file address
    # debug_handler = urllib2.HTTPHandler(debuglevel=1)
    opener2 = urllib2.build_opener(RedirectHandler, handler)
    url2 = 'http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=%d' % id
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html = opener2.open(req2, timeout=60)
        print '%s: download URL acquired' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL2 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    return html
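# Note (mine): because of RedirectHandler, soft_url() returns the file address
# as a plain string, so a slave could fetch the archive with the same Download()
# helper. Hypothetical page ID and file name:
#
#   download_url = soft_url('http://www.asp300.com/SoftView_12345.html', headers)
#   open('./soft.zip', 'wb').write(Download(download_url, headers))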
def client():
    HOST = ''  # fill in the master's IP address here
    PORT = 21567  # fill in the master's port here
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # Connect to the master and ask it for a task
    tcpCliSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    data = "ask for task"
    tcpCliSock.send(data)
    data = tcpCliSock.recv(BUFSIZ)
    return data
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'}
    url = client()
    html = Download(url, headers)
    print 'page downloaded'
    title, size, summary_content, image_url = resolve(html)
    print 'information extracted'
    id = 0
    for i in image_url:
        name = './image/image_cache%d.jpg' % id
        download_image(name, i, headers)
        print 'image %d downloaded' % id
        screenshot(name, 52, 'jpeg')
        print 'image converted'
        id += 1
    download_url = soft_url(url, headers)
    print title
    print size
    # u'\u200b' (zero-width space) cannot be encoded as gb2312, so strip it
    summary_content = summary_content.replace(u'\u200b', u'')
    print summary_content, type(summary_content)
    print summary_content.encode('gb2312')
    print image_url
    print download_url

if __name__ == '__main__':
    main()
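As written, main() asks the master for a single task, processes it, and exits. To keep a slave pulling tasks until the master has none left, the entry point can loop instead; a minimal sketch (my own wrapper, assuming the slave should give up once the master stops answering):

def run_forever():
    # Call run_forever() instead of main() in the entry point above
    while True:
        try:
            main()
        except IgnoError:
            continue  # page vanished (404): ask for the next task
        except MyError:
            continue  # unrecoverable download error: move on
        except socket.error:
            break     # master unreachable or out of tasks: stop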