Preventing your Scrapy crawler from being banned
1. delay time
import time
# first: sleep for a fixed number of seconds between requests
time.sleep(2)
# second: Scrapy's built-in delay — set DOWNLOAD_DELAY = 2 in settings.py,
# or download_delay = 2 as a class attribute on the spider
download_delay = 2
2. Ban cookies
# Disable cookies (enabled by default) in settings.py — set it to False,
# otherwise cookies remain enabled and the site can track the session
COOKIES_ENABLED = False
3. User Agent pool
3.1 Single change
in scrapy, you can check the user-agent by using request.headers
scrapy shell example.com
request.headers
you need to enable the setting in the settings.py
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'xxxxxxxxxx'
3.2 Using the user-agent list.
3.21 set the user-agent list in settings.py
default setting in settings.py is
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'txposition (+http://www.yourdomain.com)'
Change it to …
import random
USER_AGENT_LIST = [
'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
USER_AGENT = random.choice(USER_AGENT_LIST)
check if you change it successfully
# -*- coding: utf-8 -*-
import scrapy
class MyspiderSpider(scrapy.Spider):
    """Minimal spider used to verify which User-Agent header was sent."""

    name = 'myspider'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com']

    def parse(self, response):
        # Echo the UA that was actually sent together with the page size;
        # different UAs often receive different (mobile vs desktop) variants.
        sent_ua = response.request.headers['User-Agent']
        body_length = len(response.text)
        print(sent_ua, body_length)
# different user-agents usually get different type of content (mobile phone/desktop).
3.22 set the user-agent list using downloadermiddleware
default item in settings.py
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
Enable it and add the keyword user-agent in headers.
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
    'user-agent': random.choice(USER_AGENT_LIST)   # must match the name defined above (USER_AGENT_LIST); note this is chosen once at settings load, not per request
}
3.23 set the user-agent list through downloadermiddleware
changes in settings.py
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
#'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'usergent.middlewares.UsergentDownloaderMiddleware': 543,
}
changes in middleware.py
import random
class UsergentDownloaderMiddleware(object):
    """Downloader middleware that stamps a random User-Agent on each request.

    Enable it in settings.py via DOWNLOADER_MIDDLEWARES and (optionally)
    disable Scrapy's built-in UserAgentMiddleware so it does not overwrite
    the header afterwards.
    """

    def __init__(self, user_agent=''):
        # `user_agent` is accepted for interface compatibility but unused,
        # as in the original implementation.
        # The default pool contains Chrome user-agent strings on several
        # platforms; more strings can be found at
        # http://www.useragentstring.com/pages/useragentstring.php
        self.user_agent_list = [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        ]

    def process_request(self, request, spider):
        """Choose a random UA from the pool and set it on the request.

        BUG FIX: the original declared this as a @classmethod while keeping
        `self` as the first parameter. Scrapy calls process_request on the
        middleware *instance*, so the first argument was bound to the class,
        which has no `user_agent_list` attribute (it is set in __init__),
        raising AttributeError on every request. A plain instance method is
        the correct form.
        """
        ua = random.choice(self.user_agent_list)
        if ua:
            # Scrapy's Headers object is case-insensitive, so the
            # lowercase key sets the standard User-Agent header.
            request.headers['user-agent'] = ua
4. IP pools
To be completed: rotate outgoing IPs, e.g. via a proxy list and Scrapy's HttpProxyMiddleware.
5. distributed scrape
using the multiprocessing or threading module