1. Single-threaded page download
#coding=utf-8
import datetime
import csv
from day3 import link_crawler  # slightly modified: tweaked same_domain and the HTML encoding
from mongoDB import MongoCache  # the cache implemented earlier

def getUrl(number):
    # read the first `number` sites from the local Alexa top-1m CSV
    urls = []
    path = 'D:/top-1m.csv/top-1m.csv'
    for _, website in csv.reader(open(path)):
        urls.append('http://' + website)
        if len(urls) != number:
            print _, website
        else:
            return urls
from datetime import timedelta

def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=timedelta())  # zero expiry: every page gets re-downloaded
    cache.clear()
    link_crawler('http://example.webscraping.com',
                 scrape_callback=getUrl(10), cache=cache, delay=0,
                 num_retries=1, max_depth=1, user_agent='GoodCrawler')
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds

if __name__ == '__main__':
    main()
Takes 80-odd seconds.
https://bitbucket.org/wswp/code/src/9e6b82b47087c2ada0e9fdf4f5e037e151975f0f?at=default
This is the book's source-code repository. Everything there is in loose fragments, which is really annoying.
Running it raised bson.errors.InvalidStringData: strings in documents must be valid UTF-8.
The error happens when inserting into the database; calling s.decode("unicode_escape") on the string before inserting fixes it. http://blog.youkuaiyun.com/woshicsdn7547/article/details/41678093
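For my own reference, a minimal sketch of where that decode goes, assuming a MongoCache.__setitem__ roughly like the book's that stores the downloaded result dict directly; the field names here are my reconstruction, not the exact cache code:
    # inside the existing MongoCache class in mongoDB.py (sketch, not the real source)
    def __setitem__(self, url, result):
        # result['html'] may contain bytes that are not valid UTF-8, which BSON rejects,
        # so force it into a unicode string before handing it to pymongo
        html = result.get('html')
        if isinstance(html, str):
            result['html'] = html.decode('unicode_escape')
        record = {'result': result, 'timestamp': datetime.datetime.utcnow()}
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)
That one-line decode was enough to make the inserts go through.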
2. Multi-threading
#coding=utf-8
from day3 import Downloader
from day3 import normalize
from mongoDB import MongoCache
from chapter4 import getUrl
import datetime
import threading
import time
SLEEP_TIME=1
def threaded_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                     headers=None, user_agent='wswp', proxy=None, num_retries=1,
                     cache=None, scrape_callback=None, max_threads=10):
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxy, num_retries=num_retries)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)  # Downloader.__call__ fetches the page (or serves it from cache)
                if scrape_callback:
                    try:
                        # scrape_callback here is already the list of URLs from getUrl(),
                        # not a callable as in the book's version
                        links = scrape_callback or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # start a new crawl thread
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon thread: exits with the main thread, no need to join
            thread.start()
            threads.append(thread)
        # all available threads have been started;
        # sleep briefly so the CPU can focus execution on the worker threads
        # time.sleep(SLEEP_TIME)
def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=datetime.timedelta())
    cache.clear()
    threaded_crawler('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip',
                     scrape_callback=getUrl(10), cache=cache)
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds

if __name__ == '__main__':
    main()
About 23 seconds.
3. Multi-processing
Previously the crawl queue lived in local memory, so multiple processes could not share it. The first step is therefore to simulate a queue with MongoDB, where a status flag marks each URL: 0 means just queued, 1 means being processed, 2 means finished. The full source is at the link above; the key code is pasted below for my own later review.
def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()  # one process per CPU core
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for all processes to complete
    for p in processes:
        p.join()
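To make those 0/1/2 states concrete, here is a minimal sketch of a MongoDB-backed queue in the spirit of the book's MongoQueue; the class and method names are my reconstruction for review, not the exact source:
#coding=utf-8
from datetime import datetime
from pymongo import MongoClient, errors

class MongoQueue:
    # 0 = just queued, 1 = being processed, 2 = finished
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache

    def push(self, url):
        # add a new URL unless it is already known
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an outstanding URL and mark it as being processed
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}})
        if record:
            return record['_id']
        raise KeyError()

    def complete(self, url):
        # mark a URL as finished
        self.db.crawl_queue.update({'_id': url},
                                   {'$set': {'status': self.COMPLETE}})
With this in place, threaded_crawler swaps crawl_queue = [seed_url] for crawl_queue = MongoQueue(); crawl_queue.push(seed_url), so every process started by process_crawler pulls from the same shared queue.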
4. Summary
Used threads and processes to speed up the crawl. I'll look for more examples tonight.
17/9/10