# python2: a generic web-page fetching helper — GET, POST, automatic cookie management
import cookielib
import os
import time
import urllib
import urllib2
class WebCrawler(object):
    """Generic HTTP crawler: GET/POST with automatic cookie management.

    On the first visit to a URL a fresh, timestamped Mozilla-format cookie
    file is created and saved; subsequent requests whose stored URL is a
    substring of the new URL reload that file instead. The User-Agent header
    is rotated between repeat requests.
    """

    def __init__(self):
        # Index into userAgentList; tracks how many times readUrl rotated agents.
        self.userAgentCount = 0
        # Pool of User-Agent strings cycled on repeat visits to the same site.
        self.userAgentList = ['User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)',
                          'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)']
        self.cookiefile = ''    # path of the cookie file for the current site
        self.url = ' '          # last base URL visited (anchor for the substring test)
        self.isFirstWeb = False # True while handling the first visit to a URL

    def readUrl(self, url, data=None, referer=''):
        '''readUrl(url): open url and return totalHtml on success, or print the
        error code/reason and exit the application on failure.

        data    -- optional dict of POST fields (urlencoded here); None => GET
        referer -- value for the Referer header
        '''
        starttime = time.time()

        # Reject an empty URL up front.
        if url == '':
            print "readUrl error:url is none"
            return

        # Decide whether this is the first visit: when the stored URL is not
        # a substring of the new one, treat it as a new site — delete the old
        # cookie file and remember the new URL.
        # NOTE(review): substring matching means a *different page* on the
        # same site also counts as a new site — confirm this is intended.
        self.isFirstWeb = False
        if url.find(self.url) == -1:
            self.userAgentCount = 0
            # fix: guard against the cookie file having been removed externally
            if self.cookiefile and os.path.exists(self.cookiefile):
                os.remove(self.cookiefile)
            self.url = url
            self.isFirstWeb = True

        # POST bodies must be URL-encoded; leaving data as None keeps a GET.
        if data:
            data = urllib.urlencode(data)

        # Build headers with the current User-Agent, then rotate the agent on
        # repeat visits so consecutive requests look less uniform.
        header = {'User-Agent': self.userAgentList[self.userAgentCount], 'Referer': referer}
        if not self.isFirstWeb:
            self.userAgentCount = self.userAgentCount + 1
            if self.userAgentCount >= len(self.userAgentList):
                self.userAgentCount = 0

        request = urllib2.Request(url, data, header)

        # First visit: create a fresh timestamped cookie jar bound to a file.
        # Repeat visit: reload the cookies saved on the first visit.
        if self.isFirstWeb:
            self.cookiefile = time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_cookie.txt"
            cookie = cookielib.MozillaCookieJar(self.cookiefile)
        else:
            cookie = cookielib.MozillaCookieJar()
            cookie.load(self.cookiefile, ignore_discard=True, ignore_expires=True)

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

        try:
            response = opener.open(request)
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print 'readurl URLError code', e.code
            if hasattr(e, "reason"):
                print 'readurl URLError reason', e.reason
            print '503 :换个伪装的浏览器或许可行 ,500 :可能没有下一页了'
            exit(0)

        # Persist cookies only after the first successful visit.
        if self.isFirstWeb:
            cookie.save(ignore_discard=True, ignore_expires=True)

        totalHtml = response.read().decode('utf-8')
        response.close()  # fix: release the connection explicitly

        # Debug aid: keep a copy of the last successfully fetched page.
        wfile = open('log_thelastweb.html', 'w')
        try:
            wfile.write(totalHtml.encode('utf-8'))
        finally:
            wfile.close()  # fix: close even if the write fails

        endtime = time.time()
        print "usetime = " + str(endtime - starttime) + ",read url had done......"
        return totalHtml

# (CSDN page boilerplate removed — payment/comment UI text, not part of the source)