Method 1:
First, harvest proxy IPs from a free proxy-list site such as Xici (西刺) and store them in a database.
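The source does not show the table layout. A plausible schema, inferred from the columns that proxy.py queries later (IP, PORT, TYPE, SPEED), could be created with the same exec_sql helper that proxy.py imports below; this is a hypothetical sketch, not the original setup:
# Hypothetical schema, inferred from the SELECT/DELETE statements in proxy.py.
from handledb import exec_sql

kwargs = {'user': 'root', 'passwd': 'toor', 'db': 'ippool',
          'host': 'localhost', 'use_unicode': True}
exec_sql('''CREATE TABLE IF NOT EXISTS `proxy` (
    `IP`    VARCHAR(15) NOT NULL,
    `PORT`  VARCHAR(5)  NOT NULL,
    `TYPE`  VARCHAR(5),   -- 'HTTP' or 'HTTPS'
    `SPEED` FLOAT,        -- measured response time; NULL if untested
    PRIMARY KEY (`IP`, `PORT`)
)''', **kwargs)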
Then rotate through the stored proxies in the project's middlewares.py:
# -*- coding: utf-8 -*-
# base64 is only needed if the proxy requires authentication
import base64
import logging

from proxy import GetIp

ips = GetIp().get_ips()
class ProxyMiddleware(object):
    http_n = 0   # rotation index for http requests
    https_n = 0  # rotation index for https requests

    def process_request(self, request, spider):
        # Pick the next proxy of the matching scheme, wrapping around
        # when the end of the list is reached
        if request.url.startswith("http://"):
            n = ProxyMiddleware.http_n
            n = n if n < len(ips['http']) else 0
            request.meta['proxy'] = "http://%s:%d" % (
                ips['http'][n][0], int(ips['http'][n][1]))
            logging.info('Sequence - http: %s - %s' % (n, str(ips['http'][n])))
            ProxyMiddleware.http_n = n + 1
        if request.url.startswith("https://"):
            n = ProxyMiddleware.https_n
            n = n if n < len(ips['https']) else 0
            request.meta['proxy'] = "https://%s:%d" % (
                ips['https'][n][0], int(ips['https'][n][1]))
            logging.info('Sequence - https: %s - %s' % (n, str(ips['https'][n])))
            ProxyMiddleware.https_n = n + 1
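The base64 import above is only needed when a proxy requires authentication; the original code never uses it. A sketch of the extra step process_request would need in that case ('user:password' is a placeholder, not a value from the source):
        # inside process_request, after setting request.meta['proxy']:
        # attach Basic credentials for an authenticating proxy (sketch only)
        auth = base64.b64encode("user:password")
        request.headers['Proxy-Authorization'] = 'Basic ' + auth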
Then, in proxy.py, filter out the IPs that actually work:
# -*- coding: utf-8 -*-
import urllib2

from handledb import exec_sql

dbapi = "MySQLdb"
kwargs = {'user': 'root', 'passwd': 'toor', 'db': 'ippool',
          'host': 'localhost', 'use_unicode': True}
def counter(start_at=0):
    '''Return a closure that increments and returns a running count.
    Usage: f = counter(i); f()  # -> i + 1'''
    count = [start_at]
    def incr():
        count[0] += 1
        return count[0]
    return incr
def use_proxy(browser, proxy, url):
    '''Visit url through the given proxy in a Selenium Firefox browser.'''
    # Note: Firefox preferences are normally applied when the profile is
    # passed at browser launch; updating them afterwards may not take effect.
    profile = browser.profile
    profile.set_preference('network.proxy.type', 1)  # 1 = manual proxy config
    profile.set_preference('network.proxy.http', proxy[0])
    profile.set_preference('network.proxy.http_port', int(proxy[1]))
    profile.set_preference('permissions.default.image', 2)  # 2 = block images
    profile.update_preferences()
    browser.profile = profile
    browser.get(url)
    browser.implicitly_wait(30)
    return browser
class Singleton(object):
    '''Single-instance (singleton) base class.'''
    def __new__(cls, *args, **kw):
        if not hasattr(cls, '_instance'):
            orig = super(Singleton, cls)
            cls._instance = orig.__new__(cls, *args, **kw)
        return cls._instance
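# Because GetIp below subclasses Singleton, every GetIp() call returns the
# same object (though __init__ still runs on each call).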
class GetIp(Singleton):
    def __init__(self):
        # Parenthesized so NULL-speed rows are only matched for HTTP/HTTPS types
        sql = '''SELECT `IP`, `PORT`, `TYPE`
                 FROM `proxy`
                 WHERE `TYPE` REGEXP 'HTTP|HTTPS'
                 AND (`SPEED` < 5 OR `SPEED` IS NULL)
                 ORDER BY `proxy`.`TYPE` ASC
                 LIMIT 50'''
        self.result = exec_sql(sql, **kwargs)
    def del_ip(self, record):
        '''Delete an IP that can no longer be used.'''
        sql = "delete from proxy where IP='%s' and PORT='%s'" % (record[0], record[1])
        print sql
        exec_sql(sql, **kwargs)
        print record, "was deleted."
    def judge_ip(self, record):
        '''Return True if the proxy in record works, else delete it.'''
        http_url = "http://www.baidu.com/"
        https_url = "https://www.alipay.com/"
        proxy_type = record[2].lower()
        url = http_url if proxy_type == "http" else https_url
        proxy = "%s:%s" % (record[0], record[1])
        try:
            req = urllib2.Request(url=url)
            req.set_proxy(proxy, proxy_type)
            response = urllib2.urlopen(req, timeout=30)
        except Exception as e:
            print "Request Error:", e
            self.del_ip(record)
            return False
        else:
            code = response.getcode()
            if 200 <= code < 300:
                print 'Effective proxy', record
                return True
            else:
                print 'Invalid proxy', record
                self.del_ip(record)
                return False
    def get_ips(self):
        print "Proxy get_ips was executed."
        http = [h[0:2] for h in self.result if h[2] == "HTTP" and self.judge_ip(h)]
        https = [h[0:2] for h in self.result if h[2] == "HTTPS" and self.judge_ip(h)]
        print "Http:", len(http), "Https:", len(https)
        return {"http": http, "https": https}
Finally, enable both middlewares in settings.py. ProxyMiddleware gets order 100, lower than the built-in HttpProxyMiddleware at 110, so it runs first and request.meta['proxy'] is already set by the time the built-in middleware applies it.
# Dict of enabled downloader middlewares and their orders. Default: {}
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
'tutorial.middlewares.ProxyMiddleware': 100,
}
Method 2:
Alternatively, a fixed list of proxy IPs can be fed to the downloader, e.g. via a proxy_ips.py file:
# encoding=utf-8
"""Static list of proxy IPs in "ip:port" form."""
proxy_ip = ['122.142.77.85:8080', '120.52.73.173:80']
Then pick one of them at random in the spider file:
import random

from proxy_ips import proxy_ip

def random_proxy_ip():
    '''Return a randomly chosen "ip:port" string from proxy_ip.'''
    proxy_ip_index = random.randint(0, len(proxy_ip) - 1)
    # res = {'http': proxy_ip[proxy_ip_index]}  # dict form, if ever needed
    res = proxy_ip[proxy_ip_index]
    return res
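random_proxy_ip only picks an address; it still has to be attached to each request to take effect. A sketch, assuming it is defined in the same spider module (the spider name and URL are placeholders):
import scrapy

class ExampleSpider(scrapy.Spider):  # hypothetical spider
    name = 'example'

    def start_requests(self):
        # meta['proxy'] is how Scrapy's HttpProxyMiddleware picks up a proxy
        yield scrapy.Request('http://example.com/',
                             meta={'proxy': 'http://' + random_proxy_ip()})

    def parse(self, response):
        self.logger.info('Fetched %s via proxy', response.url)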
Method 2 is easier to set up and implement; Method 1 takes more work but keeps validating the pool and prunes proxies that stop working.