# -*- coding: utf-8 -*-
# Python 2: generic page-fetching helper — GET/POST with automatic cookie management.
import cookielib
import os
import time
import urllib
import urllib2
class WebCrawler(object):
"""docstring for WebCrawler web association function"""
def __init__(self):
self.userAgentCount = 0#记录是第几次调用打开浏览器,给readUrl使用
self.userAgentList = ['User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)',
'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)']
self.cookiefile = ''
self.url = ' '
self.isFirstWeb = False
#访问http网页 url,可自动分辨是需要保存还是使用上一次的cookie,成功返回页面内容,失败则返回错误码、错误原因,并退出程序
def readUrl(self,url,data = None,referer = ''):
'''readUrl(url):open url and ruturn totalHtml if success,or return error code,reason and exit application if error'''
starttime = time.time()
#url:判断url的正确性
if url == '':
print "readUrl error:url is none"
return
#url:判断是否是首次访问该网页,是的话保存cookie,不是的话取用原有的cookie
self.isFirstWeb = False
if url.find(self.url) == -1:
self.userAgentCount = 0
if self.cookiefile:
os.remove(self.cookiefile)
self.url = url
self.isFirstWeb = True
#设置data数据:
if data:
data = urllib.urlencode(data)
#header:需要使用的user agent,使用全局变量 userAgentCount
header = {'User-Agent' :self.userAgentList[self.userAgentCount],'Referer':referer }
if not self.isFirstWeb:
self.userAgentCount = self.userAgentCount + 1
if self.userAgentCount >= len(self.userAgentList):
self.userAgentCount = 0
request = urllib2.Request(url,data,header)
#设置cookie
if self.isFirstWeb:
self.cookiefile = time.strftime("%Y%m%d%H%M%S",time.localtime()) + "_cookie.txt"
cookie = cookielib.MozillaCookieJar(self.cookiefile)
else:
cookie = cookielib.MozillaCookieJar()
cookie.load(self.cookiefile,ignore_discard = True,ignore_expires = True)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
try:
response = opener.open(request)
except urllib2.URLError, e:
if hasattr(e,"code"):
print 'readurl URLError code',e.code
if hasattr(e,"reason"):
print 'readurl URLError reason',e.reason
print '503 :换个伪装的浏览器或许可行 ,500 :可能没有下一页了'
exit(0)
if self.isFirstWeb:
cookie.save(ignore_discard=True, ignore_expires=True)
#测试阶段使用,记录最后一次成功访问的网页,用于后面程序的调试
totalHtml = response.read().decode('utf-8')
wfile = open('log_thelastweb.html','w')
wfile.write(totalHtml.encode('utf-8'))
wfile.close()
endtime = time.time()
print "usetime = " + str(endtime - starttime) + ",read url had done......"
return totalHtml