# -*- coding: utf-8 -*-
# Crawler Series 6: Downloading a Web Page (exception handling, user agent, retry count)
# This walks through a simple page downloader step by step: a basic version,
# exception handling, a retry mechanism for temporary errors, and a
# user-agent setting.
import urllib2
# Download a web page
def download(url):
    return urllib2.urlopen(url).read()
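# Minimal usage sketch (example.com is a placeholder URL, not from the
# original post). This bare version propagates every failure (bad domain,
# 404, 5xx) straight to the caller as an uncaught exception:
html = download('http://example.com')
print html[:50]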
# We may run into unforeseeable errors, and the call can raise an exception
# Version with exception handling
def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download Error:', e.reason
        html = None
    return html
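# With the exception handled, a failing request now prints the error and
# returns None instead of crashing. The hostname below is a deliberately
# unresolvable placeholder (the .invalid TLD never resolves):
assert download('http://nonexistent.invalid') is None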
# Retry version (some errors are temporary, so it is worth retrying; 5xx codes indicate server-side problems)
def download(url, num_retries=2):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download Error:', e.reason
        html = None
        # Retry only on 5xx errors; 4xx errors will not be fixed by retrying
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html
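# Exercising the retry logic: httpstat.us is a public testing service whose
# /500 endpoint always answers HTTP 500 (an assumption here; substitute any
# URL that returns a 5xx status). With num_retries=2 you should see
# 'Downloading:' printed three times before it gives up and returns None:
download('http://httpstat.us/500')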
# Version with a user-agent setting and a retry count
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download Error:', e.reason
        html = None
        # Retry only on 5xx errors, keeping the same user agent
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html
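# Final version in action. 'wswp' is the default agent string from the code
# above; some sites block urllib2's default Python-x.y agent, so a short
# identifying string of your own (the one below is just an example) is more
# likely to be accepted:
html = download('http://example.com', user_agent='my-crawler')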