为了避免造成服务器过载，可以在对同一域名的两次下载之间添加时延，从而降低爬虫的下载速度。
class Throttle:
def __init__(self, delay):
    """Create a throttle with no recorded domain accesses.

    Args:
        delay: Spacing to enforce between downloads. ``wait`` only
            throttles when this value is greater than 0.
            NOTE(review): the unit is presumably seconds -- confirm
            against the rest of ``wait``.
    """
    self.delay = delay
    # Keyed by the URL's network location (netloc); ``wait`` reads the
    # entry back as ``last_accessed``. A fresh dict per instance keeps
    # separate throttles from sharing state.
    self.domains = {}
def wait(self, url):
domain = urlparse.urlparse(url).netloc
last_accessed = self.domains.get(domain)#get函数在domain不存在时返回None
if self.delay > 0 and last_accessed is not None