These are summary notes on *Web Scraping with Python* (《用Python写网络爬虫》).
1. Crawling related URLs
#coding=utf-8
import robotparser
import urlparse
import urllib2
import urllib
import re
import datetime
import time
import Queue

# Initialize a robots.txt parser for the given site
def get_robots(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
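The parser returned by get_robots can then be asked whether a given user agent may fetch a URL before downloading it. A minimal usage sketch (the site and user-agent string are illustrative assumptions):

rp = get_robots('http://example.webscraping.com')
if rp.can_fetch('GoodCrawler', 'http://example.webscraping.com/index'):
    print 'Allowed to crawl this page'
else:
    print 'Blocked by robots.txt'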
def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        # route the request through the proxy for this URL's scheme
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry on 5xx server errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html
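A typical call sketch (the user-agent string, retry count, and URL are illustrative assumptions, not fixed by the book):

headers = {'User-agent': 'GoodCrawler'}
html = download('http://example.webscraping.com', headers, proxy=None, num_retries=2)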
def same_domain(u1, u2):
    return urlparse.urlparse(u1).netloc == urlparse.urlparse(u2).netloc
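This check keeps the crawler on a single site. For example (illustrative URLs), same_domain('http://example.com/a', 'http://example.com/b') is True, while same_domain('http://example.com', 'http://other.com') is False.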
# Rate-limit downloads:
class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # timestamp of the last access per domain

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc  # network location of the URL
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # sleep off whatever remains of the delay since the last access
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
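Putting the pieces together, a minimal crawl-loop sketch (the delay, URLs, and user-agent string are assumptions for illustration):

throttle = Throttle(delay=5)
headers = {'User-agent': 'GoodCrawler'}
for url in ['http://example.webscraping.com/view/1',
            'http://example.webscraping.com/view/2']:
    throttle.wait(url)  # block until this domain may be hit again
    html = download(url, headers, None, 2)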