1. Single-threaded page download
#coding=utf-8
import datetime
import csv
from day3 import link_crawler  # slightly modified: tweaked same_domain and the HTML encoding
from mongoDB import MongoCache  # the cache implemented earlier

def getUrl(number):
    # read the first `number` sites from the local Alexa top-1m CSV
    urls = []
    path = 'D:/top-1m.csv/top-1m.csv'
    for _, website in csv.reader(open(path)):
        urls.append('http://' + website)
        if len(urls) != number:
            print _, website
        else:
            return urls
from datetime import timedelta

def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=timedelta())  # zero expiry: every page gets re-downloaded
    cache.clear()
    link_crawler('http://example.webscraping.com',
                 scrape_callback=getUrl(10), cache=cache, delay=0,
                 num_retries=1, max_depth=1, user_agent='GoodCrawler')
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds

if __name__ == '__main__':
    main()
Takes 80-odd seconds.
https://bitbucket.org/wswp/code/src/9e6b82b47087c2ada0e9fdf4f5e037e151975f0f?at=default
This is the book's source-code repository. Everything there is in loose fragments, which is really annoying.
Running it raised bson.errors.InvalidStringData: strings in documents must be valid UTF-8.
The error happens when inserting into the database; calling s.decode("unicode_escape") on the string before inserting fixes it. http://blog.youkuaiyun.com/woshicsdn7547/article/details/41678093
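For my own reference, a minimal sketch of where that decode goes, assuming a MongoCache.__setitem__ roughly like the book's that stores the downloaded result dict directly; the field names here are my reconstruction, not the exact cache code:
    # inside the existing MongoCache class in mongoDB.py (sketch, not the real source)
    def __setitem__(self, url, result):
        # result['html'] may contain bytes that are not valid UTF-8, which BSON rejects,
        # so force it into a unicode string before handing it to pymongo
        html = result.get('html')
        if isinstance(html, str):
            result['html'] = html.decode('unicode_escape')
        record = {'result': result, 'timestamp': datetime.datetime.utcnow()}
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)
That one-line decode was enough to make the inserts go through.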
2. Multi-threading
#coding=utf-8
from day3 import Downloader
from day3 import normalize
from mongoDB import MongoCache
from chapter4 import getUrl
import datetime
import threading
import time
SLEEP_TIME=1
def threaded_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                     headers=None, user_agent='wswp', proxy=None, num_retries=1,
                     cache=None, scrape_callback=None, max_threads=10):
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxy, num_retries=num_retries)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)  # Downloader.__call__ fetches the page (or serves it from cache)
                if scrape_callback:
                    try:
                        # scrape_callback here is already the list of URLs from getUrl(),
                        # not a callable as in the book's version
                        links = scrape_callback or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # start a new crawl thread
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon thread: exits with the main thread, no need to join
            thread.start()
            threads.append(thread)
        # all available threads have been started;
        # sleep briefly so the CPU can focus execution on the worker threads
        # time.sleep(SLEEP_TIME)
def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=datetime.timedelta())
    cache.clear()
    threaded_crawler('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip',
                     scrape_callback=getUrl(10), cache=cache)
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds

if __name__ == '__main__':
    main()
About 23 seconds.
3. Multi-processing
Previously the crawl queue lived in local memory, so multiple processes could not share it. The first step is therefore to simulate a queue with MongoDB, where a status flag marks each URL: 0 means just queued, 1 means being processed, 2 means finished. The full source is at the link above; the key code is pasted below for my own later review.
def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()  # one process per CPU core
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for all processes to complete
    for p in processes:
        p.join()
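To make those 0/1/2 states concrete, here is a minimal sketch of a MongoDB-backed queue in the spirit of the book's MongoQueue; the class and method names are my reconstruction for review, not the exact source:
#coding=utf-8
from datetime import datetime
from pymongo import MongoClient, errors

class MongoQueue:
    # 0 = just queued, 1 = being processed, 2 = finished
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache

    def push(self, url):
        # add a new URL unless it is already known
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an outstanding URL and mark it as being processed
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}})
        if record:
            return record['_id']
        raise KeyError()

    def complete(self, url):
        # mark a URL as finished
        self.db.crawl_queue.update({'_id': url},
                                   {'$set': {'status': self.COMPLETE}})
With this in place, threaded_crawler swaps crawl_queue = [seed_url] for crawl_queue = MongoQueue(); crawl_queue.push(seed_url), so every process started by process_crawler pulls from the same shared queue.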
4. Summary
Used threads and processes to speed up the crawl. I'll look for more examples tonight.
17/9/10