1. OS: Ubuntu 18.04.5 LTS; IDE: PyCharm; Python environment: Anaconda3, Python version 3.8; third-party libraries: Scrapy, selenium
2. Below is the code; everything I want to say is in the comments.
Step 1: write the middlewares.py file:
from ScrapySpider.settings import USER_AGENT_LIST, PROXY_LIST
import base64
import random
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

# Build a User-Agent pool for the request headers: pick a random User-Agent for every request
class Random_UserAgent:
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent  # note: the header key is 'User-Agent', with a hyphen
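USER_AGENT_LIST lives in settings.py; a minimal sketch with two placeholder agent strings (use your own, longer list in practice):

# settings.py -- sketch; the agent strings below are just placeholders
USER_AGENT_LIST = [
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
]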
# Enable IP proxies: pick a random proxy from the pool for every request
class Random_Proxies:
    def process_request(self, request, spider):
        proxy = random.choice(PROXY_LIST)
        if 'user_password' in proxy:
            # this proxy needs a username/password, so base64-encode the credentials
            b64_up = base64.b64encode(proxy['user_password'].encode('utf-8'))
            request.headers['Proxy-Authorization'] = 'Basic ' + b64_up.decode('utf-8')
            request.meta['proxy'] = proxy['ip_port']
        else:
            # the meta key must be the lowercase 'proxy', or Scrapy will ignore the proxy
            request.meta['proxy'] = proxy['ip_port']
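PROXY_LIST also lives in settings.py; the code above implies each entry is a dict with an 'ip_port' key and, for authenticated proxies, a 'user_password' key in 'user:password' form. A sketch with made-up addresses and credentials:

# settings.py -- sketch; the addresses and credentials here are made up
PROXY_LIST = [
    {'ip_port': 'http://123.207.53.84:16816', 'user_password': 'myuser:mypassword'},
    {'ip_port': 'http://113.226.18.243:8080'},  # free proxy, no auth
]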
# Depending on the target site, selenium can handle a simulated login or simulated JS clicks;
# here it is only used to execute the url's JS so we can grab the JS-generated data
class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
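The rest of process_request below is a minimal sketch inferred from the webdriver/time/HtmlResponse imports above; the headless Chrome driver and the fixed 3-second wait are my assumptions, so swap in your own driver and wait strategy:

        # continuation sketch: render the page with selenium, then hand the HTML back to Scrapy
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')          # assumed: run Chrome without a window
        driver = webdriver.Chrome(options=options)  # assumed: chromedriver is on PATH
        driver.get(url)
        time.sleep(3)                               # crude wait for the page's JS to finish rendering
        body = driver.page_source
        driver.quit()
        # returning an HtmlResponse skips the normal download, so the spider
        # parses the selenium-rendered HTML instead of the raw response
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)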