Environment:
- Windows
- Python 3.6
Download and installation
# Linux
pip install scrapy

# Windows is more painful; resolve the dependencies first
1. pip3 install wheel
2. pip3 install lxml
3. pip3 install pyopenssl
4. Download and install pywin32: https://sourceforge.net/projects/pywin32/files/pywin32/
5. Download the Twisted wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs and install it from the command line:
   pip3 install <your download directory>\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
6. pip3 install scrapy
Usage:
# View help
scrapy -h
# View help for a specific command, e.g.:
scrapy <command> -h
scrapy shell -h
# There are two kinds of commands: Project-only commands must be run from inside a
# project directory, while Global commands can be used anywhere.
# Global commands:
scrapy startproject     # create a project
scrapy genspider        # create a spider
scrapy runspider        # run a standalone python file, no project needed
scrapy shell            # interactive scraping shell
scrapy version          # show scrapy's version
scrapy version -v       # show versions of scrapy and its dependencies
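As a quick illustration of genspider/runspider (my addition, not from the original; the file name, spider name, and URL are placeholders), a minimal spider can be run as a standalone file without creating a project:

# example_spider.py -- a minimal sketch; spider name and URL are placeholders
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        # extract the page title with a CSS selector
        yield {'title': response.css('title::text').extract_first()}

# run it without a project:
#   scrapy runspider example_spider.py -o result.json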
Performance


# Synchronous calls
import requests
import time


def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return len(response.text)


urls = [
    'https://www.baidu.com/',
    'http://www.jianshu.com/',
    'https://www.sina.com.cn/',
    'https://www.python.org/',
    'https://www.cnblogs.com/',
]

start_time = time.time()
for url in urls:
    res = get_page(url)  # each call blocks in place until the task finishes and returns its result
    print(res)
stop_time = time.time()
print(stop_time - start_time)  # 16.94821572303772
# Optimization 1: plain multithreading or multiprocessing


from multiprocessing import Process
from threading import Thread
import requests
import time


def timer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('run time is %s' % (stop_time - start_time))
        return res
    return wrapper


def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        response = len(response.text)
        print(response)
        return response


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com/',
        'http://www.jianshu.com/',
        'https://www.sina.com.cn/',
        'https://www.python.org/',
        'https://www.cnblogs.com/',
    ]

    @timer
    def bar(urls):
        tasks = []
        for url in urls:
            # multiprocessing version -- reported time: 6.365699052810669
            # task = Process(target=get_page, args=(url,))
            # multithreading version -- reported time: 4.563170671463013
            task = Thread(target=get_page, args=(url,))
            task.start()
            tasks.append(task)
        for task in tasks:
            task.join()

    bar(urls)

"""
Why multiprocessing can cost more time than multithreading here:
1. starting a process is itself expensive;
2. the network conditions may differ between runs;
3. it also depends on the machine's own performance.

The problem with this approach:
we cannot open processes or threads without limit. When hundreds or thousands of
connections have to be served at once, both multiprocessing and multithreading
will eat up system resources and lower the system's responsiveness, and the
threads/processes themselves are more likely to hang.
"""
# Optimization 2: process pools and thread pools


from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import requests
import time


def timer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('run time is %s' % (stop_time - start_time))
        return res
    return wrapper


def get_page(url):
    print("GET: %s" % url)
    response = requests.get(url)
    if response.status_code == 200:
        response = len(response.text)
        # print(response)
        return response


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com/',
        'http://www.jianshu.com/',
        'https://www.sina.com.cn/',
        'https://www.python.org/',
        'https://www.cnblogs.com/',
    ]

    p = ProcessPoolExecutor(2)
    t = ThreadPoolExecutor(5)

    @timer
    def bar(urls):
        # process pool version
        # for url in urls:
        #     p.submit(get_page, url)
        # p.shutdown(wait=True)

        # thread pool version
        for url in urls:
            t.submit(get_page, url)
        t.shutdown(wait=True)

    bar(urls)

"""
The size of the process pool also affects the total time. In my tests a pool of
2 processes was the fastest, a pool of 5 was the slowest, and a single process
was also slow.
"""
# Optimization 3: process/thread pool + callback


from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import requests
import time
import os


def timer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('run time is %s' % (stop_time - start_time))
        return res
    return wrapper


def get_page(url):
    # print("GET: %s" % url)
    print('worker', os.getpid())  # PID of the process doing the download (for a thread pool this is still the main process)
    response = requests.get(url)
    if response.status_code == 200:
        response = len(response.text)
        # print(response)
        return response


def callback(res):
    res = res.result()
    # print('%s parsing' % os.getpid())  # the callback also runs in the main process


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com/',
        'http://www.jianshu.com/',
        'https://www.sina.com.cn/',
        'https://www.python.org/',
        'https://www.cnblogs.com/',
    ]

    p = ProcessPoolExecutor(3)
    t = ThreadPoolExecutor(2)

    @timer
    def bar(urls):
        # process pool version
        # for url in urls:
        #     # print(os.getpid())  # main process PID
        #     p.submit(get_page, url).add_done_callback(callback)
        # p.shutdown(wait=True)

        # thread pool version
        for url in urls:
            print('main', os.getpid())
            t.submit(get_page, url).add_done_callback(callback)
        t.shutdown(wait=True)

    bar(urls)

"""
Multiprocessing and multithreading both improve throughput, but processes and
threads still sit idle while blocked on I/O, so we keep optimizing.
"""
# Optimization 4: asyncio


import asyncio


@asyncio.coroutine
def fetch_async(host, url='/'):
    print(host, url)
    reader, writer = yield from asyncio.open_connection(host, 80)
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    request_header_content = bytes(request_header_content, encoding='utf8')
    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print(host, url, text)
    writer.close()


tasks = [
    fetch_async('www.cnblogs.com', '/neeo/'),
    fetch_async('www.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091'),
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
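A small addition, not from the original post: since Python 3.5 the same fetch can be written with async/await instead of the @asyncio.coroutine / yield from style above (which later Python versions deprecate). A minimal sketch:

import asyncio


async def fetch_async(host, url='/'):
    reader, writer = await asyncio.open_connection(host, 80)
    request = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    writer.write(request.encode('utf8'))
    await writer.drain()
    text = await reader.read()
    print(host, url, len(text))
    writer.close()


loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(
    fetch_async('www.cnblogs.com', '/'),
    fetch_async('www.bing.com', '/'),
))
loop.close()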
# Optimization 5: asyncio + aiohttp


import asyncio
import aiohttp


@asyncio.coroutine
def fetch_async(url):
    print(url)
    response = yield from aiohttp.request('GET', url)
    # data = yield from response.read()
    # print(url, data)
    print(url, response)
    response.close()


tasks = [
    fetch_async('http://www.baidu.com/'),
    fetch_async('http://www.chouti.com/'),
]

event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
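Note (my addition): newer aiohttp releases no longer accept the plain yield from aiohttp.request(...) call used above; requests go through a ClientSession with async with. A rough equivalent, assuming aiohttp 3.x:

import asyncio
import aiohttp


async def fetch_async(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.read()
            print(url, len(data))


loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(
    fetch_async('http://www.baidu.com/'),
    fetch_async('http://www.chouti.com/'),
))
loop.close()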
# Optimization 6: asyncio + requests


import asyncio
import requests


@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)


tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
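One detail worth noting (my addition): run_in_executor(None, ...) hands the blocking requests.get off to the event loop's default thread pool, so this is really threads under the hood; passing an explicit executor lets you cap how many requests run at once. A sketch:

import asyncio
import requests
from concurrent.futures import ThreadPoolExecutor


async def fetch_async(executor, url):
    loop = asyncio.get_event_loop()
    # the blocking requests.get call runs inside the executor's threads
    response = await loop.run_in_executor(executor, requests.get, url)
    print(response.url, len(response.content))


executor = ThreadPoolExecutor(2)  # at most two requests in flight at a time
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(
    fetch_async(executor, 'https://www.baidu.com/'),
    fetch_async(executor, 'https://www.python.org/'),
    fetch_async(executor, 'https://www.cnblogs.com/'),
))
loop.close()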
# Optimization 7: gevent + requests


from gevent import monkey
monkey.patch_all()  # patch the standard library before requests is imported

import gevent
import requests


def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# ##### send the requests #####
# gevent.joinall([
#     gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
# ])

# ##### send the requests (a coroutine pool caps the number of greenlets) #####
from gevent.pool import Pool
pool = Pool(None)
gevent.joinall([
    pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
])
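For reference (my addition): Pool(None) above places no cap on the number of greenlets; passing a number limits concurrency, and pool.map is a shorthand when every call takes the same kind of argument. A small sketch:

from gevent import monkey
monkey.patch_all()

import requests
from gevent.pool import Pool


def fetch(url):
    response = requests.get(url)
    print(url, len(response.text))


pool = Pool(2)  # at most two greenlets running at the same time
pool.map(fetch, [
    'https://www.python.org/',
    'https://github.com/',
    'https://www.baidu.com/',
])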
# Optimization 8: grequests


import grequests


urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]

# build a set of unsent requests
rs = (grequests.get(u) for u in urls)
# send them
grequests.map(rs)


# to guard against timeouts and other failures, an exception handler can be supplied
def exception_handler(request, exception):
    print("Request failed")


reqs = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500'),
]
grequests.map(reqs, exception_handler=exception_handler)

# imap can also be used for better performance
# github: https://github.com/kennethreitz/grequests
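A short sketch of the imap variant mentioned above (my addition): imap yields responses lazily as they complete instead of building the whole result list up front, and size caps the number of concurrent requests.

import grequests


urls = [
    'http://httpbin.org/delay/1',
    'http://python-requests.org',
    'http://httpbin.org',
]
rs = (grequests.get(u) for u in urls)
for response in grequests.imap(rs, size=2):  # size limits concurrency
    print(response.url, response.status_code)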
# Optimization 9: Twisted


# Example 1
from twisted.web.client import getPage
from twisted.internet import defer, reactor


def all_done(arg):
    reactor.stop()


def callback(contents):
    print(contents)


deferred_list = []

url_list = [
    'http://www.bing.com',
    'http://www.baidu.com',
]
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()


# Example 2
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse


def one_done(arg):
    print(arg)
    reactor.stop()


post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}

response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)

reactor.run()
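A side note (my addition, not part of the original): getPage has been deprecated and later removed in newer Twisted releases; the commonly suggested replacement is the separate treq package (pip install treq). A rough sketch of Example 1 rewritten with treq, assuming treq's get/content API:

import treq
from twisted.internet import defer, task


@defer.inlineCallbacks
def fetch(url):
    response = yield treq.get(url)
    body = yield treq.content(response)
    print(url, len(body))


def main(reactor):
    urls = ['http://www.bing.com', 'http://www.baidu.com']
    return defer.DeferredList([fetch(url) for url in urls])


task.react(main)  # runs the reactor until the returned Deferred fires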
# Optimization 10: tornado


from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

REMAINING = 0  # counter of outstanding requests, used to stop the IO loop


def handle_response(response):
    """
    Handle the response body; the counter is decremented here so that
    ioloop.IOLoop.current().stop() can be called once every request is done.
    :param response:
    :return:
    """
    global REMAINING
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    REMAINING -= 1
    if REMAINING == 0:
        ioloop.IOLoop.current().stop()


def func():
    global REMAINING
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    REMAINING = len(url_list)
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()
# And, arguably, the most impressive async I/O "module" of all: a hand-written one


import select
import socket
import time


class AsyncTimeoutException(TimeoutError):
    """Raised when a request times out."""

    def __init__(self, msg):
        self.msg = msg
        super(AsyncTimeoutException, self).__init__(msg)


class HttpContext(object):
    """Bundles the basic data of a request and its response."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        sock: the client socket for this request
        host: the host to request
        port: the port to request
        method: the HTTP method
        url: the URL to request
        data: the request body
        callback: the callback to run when the request completes
        timeout: the request timeout in seconds
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data
        self.timeout = timeout

        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Has this request already timed out?"""
        current_time = time.time()
        if (self.__start_time + self.timeout) < current_time:
            return True

    def fileno(self):
        """File descriptor of the request socket, so select can watch it."""
        return self.sock.fileno()

    def write(self, data):
        """Append a chunk of response data to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """The response is complete (or failed); run the request's callback."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
            self.method.upper(), self.url, self.host, self.data,)
        return content.encode(encoding='utf8')


class AsyncRequest(object):
    def __init__(self):
        self.fds = []
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout):
        """Create a new request."""
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError as e:
            pass
            # print('connection request sent to the remote host')
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def check_conn_timeout(self):
        """Check every request and abort the ones that have already timed out."""
        timeout_list = []
        for context in self.connections:
            if context.is_timeout():
                timeout_list.append(context)
        for context in timeout_list:
            context.finish(AsyncTimeoutException('request timed out'))
            self.fds.remove(context)
            self.connections.remove(context)

    def running(self):
        """Event loop: watch the request sockets with select and react when they are ready."""
        while True:
            r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)

            if not self.fds:
                return

            for context in r:
                sock = context.sock
                while True:
                    try:
                        data = sock.recv(8096)
                        if not data:
                            self.fds.remove(context)
                            context.finish()
                            break
                        else:
                            context.write(data)
                    except BlockingIOError as e:
                        break
                    except TimeoutError as e:
                        self.fds.remove(context)
                        self.connections.remove(context)
                        context.finish(e)
                        break

            for context in w:
                # the connection to the remote server is established; send the request data
                if context in self.fds:
                    data = context.send_request_data()
                    context.sock.sendall(data)
                    self.connections.remove(context)

            self.check_conn_timeout()


if __name__ == '__main__':
    def callback_func(context, response, ex):
        """
        :param context: the HttpContext object that holds the request details
        :param response: the response body
        :param ex: the exception raised during the request, or None on success
        :return:
        """
        print(context, response, ex)

    obj = AsyncRequest()
    url_list = [
        {'host': 'www.google.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
        {'host': 'www.baidu.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
        {'host': 'www.bing.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
    ]
    for item in url_list:
        print(item)
        obj.add_request(**item)

    obj.running()
Extensions
References
Scrapy documentation: https://doc.scrapy.org/en/latest/intro/tutorial.html
Scrapy documentation (Chinese): http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
Reference blog: http://www.cnblogs.com/linhaifeng/articles/7811861.html