Method 1:
First, harvest proxy IPs from a free proxy-list site such as Xici (西刺) and store them in a database.
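The source does not show the table layout. A plausible schema, inferred from the columns that proxy.py queries later (IP, PORT, TYPE, SPEED), could be created with the same exec_sql helper that proxy.py imports below; this is a hypothetical sketch, not the original setup:
# Hypothetical schema, inferred from the SELECT/DELETE statements in proxy.py.
from handledb import exec_sql

kwargs = {'user': 'root', 'passwd': 'toor', 'db': 'ippool',
          'host': 'localhost', 'use_unicode': True}
exec_sql('''CREATE TABLE IF NOT EXISTS `proxy` (
    `IP`    VARCHAR(15) NOT NULL,
    `PORT`  VARCHAR(5)  NOT NULL,
    `TYPE`  VARCHAR(5),   -- 'HTTP' or 'HTTPS'
    `SPEED` FLOAT,        -- measured response time; NULL if untested
    PRIMARY KEY (`IP`, `PORT`)
)''', **kwargs)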
Then rotate through the stored proxies in the project's middlewares.py:
# -*- coding: utf-8 -*-
# base64 is only needed if the proxy requires authentication
import base64
import logging

from proxy import GetIp

ips = GetIp().get_ips()
class ProxyMiddleware(object):
    http_n = 0   # rotation index for http requests
    https_n = 0  # rotation index for https requests

    def process_request(self, request, spider):
        # Pick the next proxy of the matching scheme, wrapping around
        # when the end of the list is reached
        if request.url.startswith("http://"):
            n = ProxyMiddleware.http_n
            n = n if n < len(ips['http']) else 0
            request.meta['proxy'] = "http://%s:%d" % (
                ips['http'][n][0], int(ips['http'][n][1]))
            logging.info('Sequence - http: %s - %s' % (n, str(ips['http'][n])))
            ProxyMiddleware.http_n = n + 1
        if request.url.startswith("https://"):
            n = ProxyMiddleware.https_n
            n = n if n < len(ips['https']) else 0
            request.meta['proxy'] = "https://%s:%d" % (
                ips['https'][n][0], int(ips['https'][n][1]))
            logging.info('Sequence - https: %s - %s' % (n, str(ips['https'][n])))
            ProxyMiddleware.https_n = n + 1
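The base64 import above is only needed when a proxy requires authentication; the original code never uses it. A sketch of the extra step process_request would need in that case ('user:password' is a placeholder, not a value from the source):
        # inside process_request, after setting request.meta['proxy']:
        # attach Basic credentials for an authenticating proxy (sketch only)
        auth = base64.b64encode("user:password")
        request.headers['Proxy-Authorization'] = 'Basic ' + auth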
Then, in proxy.py, filter out the IPs that actually work:
# -*- coding: utf-8 -*-
import urllib2

from handledb import exec_sql

dbapi = "MySQLdb"
kwargs = {'user': 'root', 'passwd': 'toor', 'db': 'ippool',
          'host': 'localhost', 'use_unicode': True}
def counter(start_at=0):
    '''Return a closure that increments and returns a running count.
    Usage: f = counter(i); f()  # -> i + 1'''
    count = [start_at]
    def incr():
        count[0] += 1
        return count[0]
    return incr
def use_proxy(browser, proxy, url):
    '''Visit url through the given proxy in a Selenium Firefox browser.'''
    # Note: Firefox preferences are normally applied when the profile is
    # passed at browser launch; updating them afterwards may not take effect.
    profile = browser.profile
    profile.set_preference('network.proxy.type', 1)  # 1 = manual proxy config
    profile.set_preference('network.proxy.http', proxy[0])
    profile.set_preference('network.proxy.http_port', int(proxy[1]))
    profile.set_preference('permissions.default.image', 2)  # 2 = block images
    profile.update_preferences()
    browser.profile = profile
    browser.get(url)
    browser.implicitly_wait(30)
    return browser
class Singleton(object):
    '''Single-instance (singleton) base class.'''
    def __new__(cls, *args, **kw):
        if not hasattr(cls, '_instance'):
            orig = super(Singleton, cls)
            cls._instance = orig.__new__(cls, *args, **kw)
        return cls._instance
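# Because GetIp below subclasses Singleton, every GetIp() call returns the
# same object (though __init__ still runs on each call).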
class GetIp(Singleton):
    def __init__(self):
        # Parenthesized so NULL-speed rows are only matched for HTTP/HTTPS types
        sql = '''SELECT `IP`, `PORT`, `TYPE`
                 FROM `proxy`
                 WHERE `TYPE` REGEXP 'HTTP|HTTPS'
                 AND (`SPEED` < 5 OR `SPEED` IS NULL)
                 ORDER BY `proxy`.`TYPE` ASC
                 LIMIT 50'''
        self.result = exec_sql(sql, **kwargs)
    def del_ip(self, record):
        '''Delete an IP that can no longer be used.'''
        sql = "delete from proxy where IP='%s' and PORT='%s'" % (record[0], record[1])
        print sql
        exec_sql(sql, **kwargs)
        print record, "was deleted."
    def judge_ip(self, record):
        '''Return True if the proxy in record works, else delete it.'''
        http_url = "http://www.baidu.com/"
        https_url = "https://www.alipay.com/"
        proxy_type = record[2].lower()
        url = http_url if proxy_type == "http" else https_url
        proxy = "%s:%s" % (record[0], record[1])
        try:
            req = urllib2.Request(url=url)
            req.set_proxy(proxy, proxy_type)
            response = urllib2.urlopen(req, timeout=30)
        except Exception as e:
            print "Request Error:", e
            self.del_ip(record)
            return False
        else:
            code = response.getcode()
            if 200 <= code < 300:
                print 'Effective proxy', record
                return True
            else:
                print 'Invalid proxy', record
                self.del_ip(record)
                return False
    def get_ips(self):
        print "Proxy get_ips was executed."
        http = [h[0:2] for h in self.result if h[2] == "HTTP" and self.judge_ip(h)]
        https = [h[0:2] for h in self.result if h[2] == "HTTPS" and self.judge_ip(h)]
        print "Http:", len(http), "Https:", len(https)
        return {"http": http, "https": https}
Finally, enable both middlewares in settings.py. ProxyMiddleware gets order 100, lower than the built-in HttpProxyMiddleware at 110, so it runs first and request.meta['proxy'] is already set by the time the built-in middleware applies it.
# Dict of enabled downloader middlewares and their orders. Default: {}
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
'tutorial.middlewares.ProxyMiddleware': 100,
}
Method 2:
Alternatively, a fixed list of proxy IPs can be fed to the downloader, e.g. via a proxy_ips.py file:
# encoding=utf-8
"""Static list of proxy IPs in "ip:port" form."""
proxy_ip = ['122.142.77.85:8080', '120.52.73.173:80']
Then pick one of them at random in the spider file:
import random

from proxy_ips import proxy_ip

def random_proxy_ip():
    '''Return a randomly chosen "ip:port" string from proxy_ip.'''
    proxy_ip_index = random.randint(0, len(proxy_ip) - 1)
    # res = {'http': proxy_ip[proxy_ip_index]}  # dict form, if ever needed
    res = proxy_ip[proxy_ip_index]
    return res
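random_proxy_ip only picks an address; it still has to be attached to each request to take effect. A sketch, assuming it is defined in the same spider module (the spider name and URL are placeholders):
import scrapy

class ExampleSpider(scrapy.Spider):  # hypothetical spider
    name = 'example'

    def start_requests(self):
        # meta['proxy'] is how Scrapy's HttpProxyMiddleware picks up a proxy
        yield scrapy.Request('http://example.com/',
                             meta={'proxy': 'http://' + random_proxy_ip()})

    def parse(self, response):
        self.logger.info('Fetched %s via proxy', response.url)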
Method 2 is easier to set up and implement; Method 1 takes more work but keeps validating the pool and prunes proxies that stop working.