爬虫性能

Catalog

异步非阻塞

# gevent's monkey patches are applied before `requests` is imported below;
# keep the statements in this order.
# NOTE(review): presumably the ordering is what lets the HTTP calls cooperate
# with greenlets -- confirm against the gevent documentation.
from gevent import monkey
monkey.patch_all()
import requests, gevent
# 待访问的URL
def get_urls():
    """Return the JD search-result URLs to crawl.

    The ``page={}`` placeholder of the template is filled with every odd
    number from 1 up to 199, which yields 100 URLs.
    """
    template = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page={}&click=0'
    odd_pages = range(1, 200, 2)
    return list(map(template.format, odd_pages))
# 网页请求
def request(url, timeout=10):
    """Send a GET request to ``url`` and print its status code and final URL.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block this worker forever.

    Raises:
        Nothing is caught here; any exception from ``requests.get``
        (including a timeout) propagates to the caller.
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    response = requests.get(url, headers=header, timeout=timeout)
    print(response.status_code, response.url)
# 异步非阻塞
def grequest():
    """Request every URL from ``get_urls()`` through a gevent pool of size 16.

    One greenlet running ``request`` is spawned per URL; the function
    returns after ``gevent.joinall`` has waited for all of them.
    """
    from gevent.pool import Pool

    workers = Pool(16)
    jobs = []
    for target in get_urls():
        jobs.append(workers.spawn(request, target))
    gevent.joinall(jobs)
# 时间测试
if __name__ == '__main__':
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring elapsed time; time() follows the wall clock, which can be
    # adjusted while the benchmark is running.
    from time import perf_counter

    started = perf_counter()
    grequest()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - started)

多线程+异步非阻塞

# gevent's monkey patches are applied before `requests` is imported below;
# keep the statements in this order.
# NOTE(review): presumably the ordering is what lets the HTTP calls cooperate
# with greenlets -- confirm against the gevent documentation.
from gevent import monkey
monkey.patch_all()
import requests, gevent
# 待访问的URL
def get_urls():
    """Return the JD search-result URLs to crawl.

    Pages 1, 3, 5, ... 199 are substituted into the ``page={}`` placeholder,
    producing 100 URLs in ascending page order.
    """
    base = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page={}&click=0'
    collected = []
    page = 1
    while page < 200:
        collected.append(base.format(page))
        page += 2
    return collected
# 网页请求
def request(url, timeout=10):
    """Send a GET request to ``url`` and print its status code and final URL.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block this worker forever.

    Raises:
        Nothing is caught here; any exception from ``requests.get``
        (including a timeout) propagates to the caller.
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    response = requests.get(url, headers=header, timeout=timeout)
    print(response.status_code, response.url)
# 异步非阻塞
def grequest(urls):
    """Request each URL in ``urls`` through a gevent pool of size 4.

    Args:
        urls: Iterable of addresses; one greenlet running ``request`` is
            spawned per entry.

    Returns after ``gevent.joinall`` has waited for every spawned greenlet.
    """
    from gevent.pool import Pool

    spawn = Pool(4).spawn
    greenlets = [spawn(request, link) for link in urls]
    gevent.joinall(greenlets)
# 多线程
def concurrent(n=4):
    """Crawl all URLs by handing batches of them to ``grequest`` on a thread pool.

    The list from ``get_urls()`` is cut into at most ``n`` consecutive
    batches, and each batch is submitted to a ``ThreadPoolExecutor`` with
    ``n`` workers. The function returns once every batch has finished.

    Args:
        n: Number of worker threads and upper bound on the number of batches.
    """
    from concurrent.futures import ThreadPoolExecutor

    # The ``with`` block waits for all submitted batches and shuts the pool
    # down even when an error interrupts the submission loop.
    with ThreadPoolExecutor(n) as pool:
        url_ls = get_urls()
        length = len(url_ls)
        # Ceiling division keeps the batches even (25 each for 100 URLs and
        # 4 workers); max() keeps the range step non-zero for an empty list.
        step = max(1, -(-length // n))
        for i in range(0, length, step):
            pool.submit(grequest, url_ls[i:i + step])
# 时间测试
if __name__ == '__main__':
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring elapsed time; time() follows the wall clock, which can be
    # adjusted while the benchmark is running.
    from time import perf_counter

    started = perf_counter()
    concurrent()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - started)

多进程

import requests
# 待访问的URL
def get_urls():
    """Return the JD search-result URLs to crawl.

    The k-th URL (k = 0..99) carries page number ``2 * k + 1``, i.e. the odd
    pages 1 through 199, filled into the ``page={}`` placeholder.
    """
    pattern = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page={}&click=0'
    result = []
    for k in range(100):
        result.append(pattern.format(2 * k + 1))
    return result
# 网页请求
def request(urls, timeout=10):
    """Fetch each URL in ``urls`` one after another and print the outcome.

    Args:
        urls: Iterable of addresses to fetch with HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block this worker forever.

    A failed request is reported and skipped so that the remaining URLs of
    the batch are still fetched.
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    for url in urls:
        try:
            response = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException as err:
            # This function runs as a pool task whose result is never read,
            # so an uncaught error would drop the rest of the batch unseen.
            print('request failed:', url, err)
            continue
        print(response.status_code, response.url)
# 多进程
def concurrent(n=16):
    """Crawl all URLs by handing batches of them to ``request`` on a process pool.

    The list from ``get_urls()`` is cut into at most ``n`` consecutive
    batches, and each batch is submitted to a ``ProcessPoolExecutor`` with
    ``n`` workers. The function returns once every batch has finished.

    Args:
        n: Number of worker processes and upper bound on the number of
            batches.
    """
    from concurrent.futures import ProcessPoolExecutor

    # The ``with`` block waits for all submitted batches and shuts the pool
    # down even when an error interrupts the submission loop.
    with ProcessPoolExecutor(n) as pool:
        url_ls = get_urls()
        length = len(url_ls)
        # Ceiling division keeps the batches as even as possible; max()
        # keeps the range step non-zero for an empty list.
        step = max(1, -(-length // n))
        for i in range(0, length, step):
            pool.submit(request, url_ls[i:i + step])
# 时间测试
if __name__ == '__main__':
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring elapsed time; time() follows the wall clock, which can be
    # adjusted while the benchmark is running.
    from time import perf_counter

    started = perf_counter()
    concurrent()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - started)

测试结果

| 方法 | 时间(秒) |
| --- | --- |
| 异步 | 3.4 |
| 多线程+异步 | 3.5 |
| 多进程 | 4.8 |
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小基基o_O

您的鼓励是我创作的巨大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值