# python2: a generic web-page fetching helper — GET, POST, automatic cookie management
import cookielib
import os
import time
import urllib
import urllib2
class WebCrawler(object):
    """Generic HTTP crawler: GET/POST with automatic cookie management.

    On the first visit to a URL a fresh, timestamped Mozilla-format cookie
    file is created and saved; subsequent requests whose stored URL is a
    substring of the new URL reload that file instead. The User-Agent header
    is rotated between repeat requests.
    """

    def __init__(self):
        # Index into userAgentList; tracks how many times readUrl rotated agents.
        self.userAgentCount = 0
        # Pool of User-Agent strings cycled on repeat visits to the same site.
        self.userAgentList = ['User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)',
                          'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)']
        self.cookiefile = ''    # path of the cookie file for the current site
        self.url = ' '          # last base URL visited (anchor for the substring test)
        self.isFirstWeb = False # True while handling the first visit to a URL

    def readUrl(self, url, data=None, referer=''):
        '''readUrl(url): open url and return totalHtml on success, or print the
        error code/reason and exit the application on failure.

        data    -- optional dict of POST fields (urlencoded here); None => GET
        referer -- value for the Referer header
        '''
        starttime = time.time()

        # Reject an empty URL up front.
        if url == '':
            print "readUrl error:url is none"
            return

        # Decide whether this is the first visit: when the stored URL is not
        # a substring of the new one, treat it as a new site — delete the old
        # cookie file and remember the new URL.
        # NOTE(review): substring matching means a *different page* on the
        # same site also counts as a new site — confirm this is intended.
        self.isFirstWeb = False
        if url.find(self.url) == -1:
            self.userAgentCount = 0
            # fix: guard against the cookie file having been removed externally
            if self.cookiefile and os.path.exists(self.cookiefile):
                os.remove(self.cookiefile)
            self.url = url
            self.isFirstWeb = True

        # POST bodies must be URL-encoded; leaving data as None keeps a GET.
        if data:
            data = urllib.urlencode(data)

        # Build headers with the current User-Agent, then rotate the agent on
        # repeat visits so consecutive requests look less uniform.
        header = {'User-Agent': self.userAgentList[self.userAgentCount], 'Referer': referer}
        if not self.isFirstWeb:
            self.userAgentCount = self.userAgentCount + 1
            if self.userAgentCount >= len(self.userAgentList):
                self.userAgentCount = 0

        request = urllib2.Request(url, data, header)

        # First visit: create a fresh timestamped cookie jar bound to a file.
        # Repeat visit: reload the cookies saved on the first visit.
        if self.isFirstWeb:
            self.cookiefile = time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_cookie.txt"
            cookie = cookielib.MozillaCookieJar(self.cookiefile)
        else:
            cookie = cookielib.MozillaCookieJar()
            cookie.load(self.cookiefile, ignore_discard=True, ignore_expires=True)

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

        try:
            response = opener.open(request)
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print 'readurl URLError code', e.code
            if hasattr(e, "reason"):
                print 'readurl URLError reason', e.reason
            print '503 :换个伪装的浏览器或许可行 ,500 :可能没有下一页了'
            exit(0)

        # Persist cookies only after the first successful visit.
        if self.isFirstWeb:
            cookie.save(ignore_discard=True, ignore_expires=True)

        totalHtml = response.read().decode('utf-8')
        response.close()  # fix: release the connection explicitly

        # Debug aid: keep a copy of the last successfully fetched page.
        wfile = open('log_thelastweb.html', 'w')
        try:
            wfile.write(totalHtml.encode('utf-8'))
        finally:
            wfile.close()  # fix: close even if the write fails

        endtime = time.time()
        print "usetime = " + str(endtime - starttime) + ",read url had done......"
        return totalHtml

# (CSDN page boilerplate removed — payment/comment UI text, not part of the source)