# middlewares.py file
# Swap the User-Agent and read the saved cookies
import time
from selenium import webdriver
from scrapy.http import HtmlResponse, Headers
from selenium.webdriver.chrome.options import Options
class UAMiddleware(object):
    """Downloader middleware that sets a random User-Agent and a saved Cookie header.

    The login URL is answered directly with an empty ``HtmlResponse``; every
    other request gets its headers replaced by a fixed header set that
    carries a randomly chosen User-Agent and the cookies stored on disk.
    """

    # Pool of User-Agent strings taken from the 'UAPOOL' setting.
    # NOTE(review): `settings` is not imported in the visible part of this
    # file -- confirm where it is brought into scope.
    user_agent_list = settings['UAPOOL']

    def process_request(self, request, spider):
        """Rewrite the headers of ``request`` before it is downloaded.

        Args:
            request: The outgoing request; its ``headers`` attribute is
                replaced wholesale.
            spider: The spider that issued the request (not used here).

        Returns:
            An empty ``HtmlResponse`` for the login URL, otherwise ``None``.
        """
        print("((((((((((()))))))))))))))))))))))))))))))))))))))))))))))))")

        # The login page is not given the cookie/User-Agent treatment; an
        # empty response is returned for it instead.
        if request.url == "https://www.qichacha.com/user_login?back=%2F":
            return HtmlResponse(url=request.url, encoding="utf-8")

        chosen_agent = random.choice(self.user_agent_list)

        # Load the cookie values that the login step saved earlier.
        with open('E:cookies.json', 'r', encoding='utf-8') as cookie_file:
            saved_cookies = json.load(cookie_file)

        # Despite the original variable name ("cookies_dict"), this is the
        # flat "name=value;" string used as the Cookie header, not a dict.
        cookie_header = "".join(
            entry['name'] + "=" + entry['value'] + ";"
            for entry in saved_cookies
        )
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@111111")
        print(cookie_header)

        header_fields = {
            'Host': 'www.qichacha.com',
            'User-Agent': chosen_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age= 0",
            'Cookie': cookie_header,
        }
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@")

        # Replaces every header already on the request, not just the ones
        # listed above.
        request.headers = Headers(header_fields)
# Simulated login
from PIL import Image,ImageEnhance
import scrapy
import json
import re
import random
import urllib.parse
import scrapy,time
import urllib.parse
from openpyxl import Workbook
from openpyxl import load_workbook
from selenium import webdriver
# Module-level openpyxl workbook, created as a side effect of importing this
# module.
# NOTE(review): neither `wb` nor `ws` is referenced in the visible part of
# this file -- presumably they are used further down; confirm.
wb = Workbook()
# Active worksheet of the workbook created above.
ws = wb.active
import pytesseract
from selenium.webdriver import ActionChains
class JSPageMiddleware(object):
#通过chrome 动态访问,滑块移动函数(未用)
def get_strack(self, distance=348):
    """Build the list of per-step pixel offsets for dragging a slider.

    The drag is simulated from rest in fixed time steps of 0.2: the
    acceleration is 3 until three fifths of ``distance`` has been covered
    and 6 afterwards. Each list entry is the whole number of pixels to
    move during one step.

    The position is clamped at ``distance`` and every entry is the
    difference between consecutive *rounded cumulative* positions, so the
    entries always add up to ``round(distance)``. Rounding each step on
    its own (as was done before) loses the fractional parts and, together
    with the unclamped last step, made the total miss the target.

    Args:
        distance: Total distance to cover, in pixels. Defaults to 348,
            the value that used to be hard-coded.

    Returns:
        A list of non-negative ints whose sum is ``round(distance)``;
        an empty list when ``distance`` is not positive.
    """
    time_step = 0.2
    midpoint = distance * 3 / 5

    offsets = []
    velocity = 0.0
    position = 0.0  # exact (unrounded) distance covered so far
    emitted = 0     # whole pixels already handed out in `offsets`

    while position < distance:
        acceleration = 3 if position < midpoint else 6

        # Uniformly accelerated motion over one time step.
        step = velocity * time_step + 0.5 * acceleration * time_step * time_step
        velocity += acceleration * time_step

        # Never travel past the target.
        position = min(position + step, distance)

        # Emit only the pixels not yet accounted for, so rounding error is
        # carried forward instead of being dropped at every step.
        pixels = round(position) - emitted
        emitted += pixels
        offsets.append(pixels)

    return offsets
# Simulated login; save the cookies
def process_request(self, request, spider):
#chrome_options = Options()
# chrome_options.add_argument('--headless') # 使用无头谷歌浏览器模式
# chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
if request.url=="https://www.qichacha.com/user_login?back=%2F" :
chrome_options = Options()
# 指定谷歌浏览器路径
self.browser = webdriver.Chrome(chrome_options=chrome_options, executable_path="F:chromedriver.exe")
self.browser.get(request.url)
# time.sleep(3)
self.browser.find_element_by_xpath('//*[@id = "normalLogin"]').click() # 转到登录界面
self.browser.find_element_by_xpath(' // *[ @ id = "nameNormal"]').send_keys("13080024006") # 账号
self.browser.find_element_by_xpath('// *[ @ id = "pwdNormal"]').send_keys("13080024006") # 密码
time.sleep(3)
but