Goal: build a master-slave distributed crawler that runs over a wide-area network, using the asp300 site as the example. The master crawls the task URLs; every time a slave finishes a task it sends a task request to the master, and the master assigns it the next one.
Principle:
The crawler splits into two parts, a master and its slaves. The number of slaves is unbounded; it is limited only by how much load the master can absorb. Normally one master is enough, but once it hits its performance ceiling it becomes the bottleneck that caps the throughput of the whole system, the classic "weakest plank in the bucket" effect. For example, if the master can dispatch 50 tasks per second and each slave completes 0.5 tasks per second, adding slaves beyond 100 gains nothing.
How to raise the master's efficiency is not explored in this article; readers are left to investigate it themselves.
Tools:
Language: Python (Python 2: the code relies on urllib2)
Site: ASP300.COM
Environment: Tencent Cloud student instance (1 GHz single-core CPU, 1 GB RAM)
Code examples:
Master code:
#coding=gb2312
import urllib2
import random
import socket

# Pool of User-Agent strings; one is picked at random per request so the
# crawler's traffic looks less uniform
user_agent = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
              "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
              "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
              "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
              "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"]
def iteration():
    # Probe list pages 1-99 and collect the ones that still exist
    url_list = []
    for i in range(1, 100):
        url = 'http://www.asp300.com/SoftList/27/27_%d.html' % i
        # Forge the client identity with a random User-Agent
        headers = {'User-Agent': user_agent[random.randint(0, 6)]}
        req = urllib2.Request(url=url, headers=headers)
        # Skip pages that have gone missing (404)
        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError, e:
            if hasattr(e, 'code') and e.code == 404:
                continue
        url_list.append(url)
    return url_list
def server(url_list):
    HOST = ''
    PORT = 21567
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # Create the listening socket
    tcpSerSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpSerSock.bind(ADDR)
    tcpSerSock.listen(5)
    while True:
        # Loop forever so the master keeps accepting connections from slaves
        print 'waiting for connection......'
        tcpCliSock, addr = tcpSerSock.accept()
        print '...connected from:', addr
        while True:
            data = tcpCliSock.recv(BUFSIZ)
            if not data:
                break
            # Hand one task URL to the slave
            # (pop() raises IndexError once the task list is exhausted)
            tcpCliSock.send('%s' % url_list.pop())
        tcpCliSock.close()
    tcpSerSock.close()

if __name__ == '__main__':
    url_list = iteration()
    server(url_list)
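One limitation worth noting: server() talks to a single slave at a time, since accept() only runs again after the current connection closes. The article leaves master-side efficiency to the reader, but as a starting point, here is a minimal sketch (my own addition, not part of the original system) that serves each slave on its own thread and shares the tasks through a thread-safe queue:

#coding=gb2312
import Queue
import SocketServer

task_queue = Queue.Queue()

class TaskHandler(SocketServer.BaseRequestHandler):
    # One instance per slave connection, each running on its own thread
    def handle(self):
        while True:
            data = self.request.recv(1024)
            if not data:
                break
            try:
                # Hand out the next task; reply with an empty string when done
                self.request.send(task_queue.get_nowait())
            except Queue.Empty:
                self.request.send('')
                break

if __name__ == '__main__':
    for url in iteration():  # iteration() as defined in the master code above
        task_queue.put(url)
    server = SocketServer.ThreadingTCPServer(('', 21567), TaskHandler)
    server.serve_forever()

Because Queue.Queue serializes access, two slaves can never be handed the same task even when their requests arrive simultaneously.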
Slave code:
#coding=gb2312
import urllib2
import cookielib
import re
from PIL import Image
from bs4 import BeautifulSoup
import socket
import time

class MyError(Exception):
    # Custom exception, raised on unrecoverable download errors
    pass

class IgnoError(Exception):
    # Custom exception, raised for pages that should simply be skipped (404)
    pass

class RedirectHandler(urllib2.HTTPRedirectHandler):
    # Override the 301/302 handlers: instead of following the redirect, hand
    # back the target URL, so opener.open() returns the redirect location itself
    def http_error_301(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl

    def http_error_302(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl
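# How this works (my note, not from the original article): urllib2 passes 3xx
# responses to http_error_301/http_error_302, and whatever those methods return
# becomes the return value of opener.open(). Illustration with a made-up CodeID:
#
#   opener = urllib2.build_opener(RedirectHandler)
#   real_url = opener.open('http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=123')
#   # real_url is the redirect target as a plain string, not a response object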
def Download(url, headers, num_retries=9):
    # Download the page the URL points to, retrying on 5xx errors and timeouts
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req, timeout=60)
        the_page = response.read()
        response.close()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return Download(url, headers, num_retries - 1)
        elif hasattr(e, 'code') and (e.code == 404):
            raise IgnoError
        else:
            print 'Download Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return Download(url, headers, num_retries - 1)
        raise socket.timeout
    return the_page
def resolve(html):
    # Extract the title, file size, summary, and screenshot URLs from the page
    image_url = []
    soup = BeautifulSoup(html, "lxml")
    title = unicode(soup.head.title.string)
    title = re.search(u'(.*?)_ASP300', title).groups()[0]
    size = soup.find('div', class_='box').find('div', class_='box_1').find('div', id='goodsInfo').find('div', class_='textInfo').ul.find_all('li')[2].dd
    size = unicode(size)
    # Parse the numeric value out of the "软件大小" (file size) field
    size = float(re.search(u'软件大小:([\d.]+)', size).groups()[0])
    summary_tag = soup.find('div', class_='s')
    summary_content = unicode(summary_tag).strip()
    summary_content = summary_content.split('<br/>')
    summary_content[0] = summary_content[0][15:]  # drop the leading '<div class="s">'
    del summary_content[len(summary_content) - 1]
    for a, b in enumerate(summary_content):
        # Drop blank-line entries
        if b == '\n':
            del summary_content[a]
    summary_cache = u''
    for c in summary_content:
        summary_cache += (c + u'<br/>')
    summary_content = summary_cache
    # Collect screenshot URLs; the elements of image_url are str, not unicode
    for i in summary_tag.p.find_all('img'):
        image_url.append('http://www.asp300.com' + i['src'])
    # title and summary_content are unicode, size is float, image_url holds str
    return title, size, summary_content, image_url
def download_image(name, url, headers, num_tries=9):
    # Download one screenshot, retrying on 5xx errors and timeouts
    req = urllib2.Request(url=url, headers=headers)
    try:
        f = urllib2.urlopen(req, timeout=60)
    except urllib2.URLError, e:
        if num_tries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_image(name, url, headers, num_tries - 1)
        else:
            print 'Image download error:', e.reason
            raise MyError
    except socket.timeout:
        if num_tries > 0:
            return download_image(name, url, headers, num_tries - 1)
        raise socket.timeout
    image = open(name, 'wb')
    image.write(f.read())
    f.close()
    image.close()
def screenshot(name, change, format):
    # Remove the watermark by cropping `change` pixels off the bottom edge
    im = Image.open(name)
    w, h = im.size
    box = (0, 0, w, h - change)
    region = im.crop(box)
    region.save(name, format)
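# Usage note (mine): the watermark sits in a fixed-height band at the bottom of
# each screenshot, so a fixed crop is enough. main() below calls it as:
#
#   screenshot('./image/image_cache0.jpg', 52, 'jpeg')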
def soft_url(url, headers, num_retries=9):
    # Resolve the real download address of the software
    id = int(re.search('SoftView_(.*?)\.html', url).groups()[0])
    url1 = 'http://www.asp300.com/2012dll/Down.jsp?CodeID=%d&id=1' % id
    # Step 1: fetch the download cookie for this item
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener1 = urllib2.build_opener(handler)
    req1 = urllib2.Request(url=url1, headers=headers)
    try:
        opener1.open(req1, timeout=60)
        print '%s: download cookie acquired' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL1 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    # Cookie acquired
    # Step 2: with that cookie, request the download page; RedirectHandler makes
    # opener2.open() return the redirect target, i.e. the real file address
    # debug_handler = urllib2.HTTPHandler(debuglevel=1)
    opener2 = urllib2.build_opener(RedirectHandler, handler)
    url2 = 'http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=%d' % id
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html = opener2.open(req2, timeout=60)
        print '%s: download URL acquired' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL2 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    return html
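# Note (mine): because of RedirectHandler, soft_url() returns the file address
# as a plain string, so a slave could fetch the archive with the same Download()
# helper. Hypothetical page ID and file name:
#
#   download_url = soft_url('http://www.asp300.com/SoftView_12345.html', headers)
#   open('./soft.zip', 'wb').write(Download(download_url, headers))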
def client():
    HOST = ''  # fill in the master's IP address here
    PORT = 21567  # fill in the master's port here
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # Connect to the master and ask it for a task
    tcpCliSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    data = "ask for task"
    tcpCliSock.send(data)
    data = tcpCliSock.recv(BUFSIZ)
    return data
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'}
    url = client()
    html = Download(url, headers)
    print 'page downloaded'
    title, size, summary_content, image_url = resolve(html)
    print 'information extracted'
    id = 0
    for i in image_url:
        name = './image/image_cache%d.jpg' % id
        download_image(name, i, headers)
        print 'image %d downloaded' % id
        screenshot(name, 52, 'jpeg')
        print 'image converted'
        id += 1
    download_url = soft_url(url, headers)
    print title
    print size
    # u'\u200b' (zero-width space) cannot be encoded as gb2312, so strip it
    summary_content = summary_content.replace(u'\u200b', u'')
    print summary_content, type(summary_content)
    print summary_content.encode('gb2312')
    print image_url
    print download_url

if __name__ == '__main__':
    main()
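As written, main() asks the master for a single task, processes it, and exits. To keep a slave pulling tasks until the master has none left, the entry point can loop instead; a minimal sketch (my own wrapper, assuming the slave should give up once the master stops answering):

def run_forever():
    # Call run_forever() instead of main() in the entry point above
    while True:
        try:
            main()
        except IgnoError:
            continue  # page vanished (404): ask for the next task
        except MyError:
            continue  # unrecoverable download error: move on
        except socket.error:
            break     # master unreachable or out of tasks: stop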