1. Python libraries used
from urllib import request
import ssl
from bs4 import BeautifulSoup
import sys
import io
import json
import time
import random
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from mitmproxy import ctx
2. Main approach
1. First, request the page to be crawled with urllib's request module:
# Build the request
req = request.Request(url)
# Attach the cookie saved in cookie.txt
file = open('cookie.txt', 'r+', encoding='utf-8')
cookie_str = file.read()
file.close()
# print('cookie_str', cookie_str)
req.add_header('cookie', cookie_str)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/74.0.3729.169 Safari/537.36')
response = request.urlopen(req)
# Status code: 200 means success
print(response.getcode())
# Read the page content
read = response.read()
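Note that response.read() returns bytes, and cookie.txt is expected to hold the raw Cookie header string copied from the browser's developer tools (key1=value1; key2=value2). A minimal sketch wrapping the same request with decoding and basic error handling (the function name fetch and the utf-8 fallback are my own choices, not from the original):

from urllib import request, error

def fetch(url, cookie_path='cookie.txt'):
    # Fetch a page with the saved cookie; returns decoded HTML or None
    with open(cookie_path, encoding='utf-8') as f:
        cookie_str = f.read().strip()
    req = request.Request(url)
    req.add_header('cookie', cookie_str)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/74.0.3729.169 Safari/537.36')
    try:
        response = request.urlopen(req, timeout=15)
        return response.read().decode('utf-8', errors='replace')
    except error.URLError as e:
        print('request failed:', e)
        return None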
2. Parse the fetched page content with BeautifulSoup:
soup = BeautifulSoup(read, "html.parser")
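Once parsed, data comes out through the usual BeautifulSoup accessors; a small sketch (the tag names and CSS selector below are illustrative, not from the original target page):

# Page title text
print(soup.title.string if soup.title else 'no <title>')
# Every link target on the page
for a in soup.find_all('a', href=True):
    print(a['href'])
# CSS-selector lookup on a hypothetical structure
for item in soup.select('div.item > span'):
    print(item.get_text(strip=True))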
3. For pages that require login, simulate the login with Selenium; here I configure Chrome through webdriver.ChromeOptions:
ChromeDriver download: http://chromedriver.chromium.org/downloads/version-selection
or http://chromedriver.storage.googleapis.com/index.html
Note that the downloaded driver version must match the installed browser version. One trap: Chrome quietly updates itself, and suddenly the crawler starts throwing errors; when that happens, go back to the ChromeDriver download page and fetch the driver matching the new browser version.
Unzip the archive, copy chromedriver.exe into Chrome's installation directory, then add the path to chromedriver.exe to the system's environment variables.
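A quick way to confirm the driver matches the browser: on a mismatch, Selenium raises SessionNotCreatedException, whose message names both versions. A minimal sketch; the browserVersion capability key assumes a W3C-compliant driver:

from selenium import webdriver
from selenium.common.exceptions import SessionNotCreatedException

try:
    browser = webdriver.Chrome()  # assumes chromedriver.exe is on the PATH
    print('Chrome version:', browser.capabilities.get('browserVersion'))
    browser.quit()
except SessionNotCreatedException as e:
    # The message typically reads "this version of ChromeDriver only supports Chrome version NN"
    print('Version mismatch, re-download ChromeDriver:', e.msg)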
# ********** Skip certificate verification so mitmproxy's self-signed certificate is accepted
ssl._create_default_https_context = ssl._create_unverified_context
# ******************
# Browser configuration
chromeOptions = webdriver.ChromeOptions()
# ********** Route traffic through the mitmproxy proxy
chromeOptions.add_argument("--proxy-server=http://127.0.0.1:8080")
# **********
# Required on Linux
chromeOptions.add_argument('--no-sandbox')
# Run headless (no visible browser window)
chromeOptions.add_argument('--headless')
# Hide the "Chrome is being controlled by automated software" infobar
chromeOptions.add_argument('--disable-infobars')
# Change window.navigator.webdriver and related giveaway properties
chromeOptions.add_experimental_option("excludeSwitches", ['enable-automation'])
# Path to the driver executable
chrome_driver = "C:/Users/lin/AppData/Local/Google/Chrome/Application/Chromedriver.exe"
# Instantiate the browser
browser = webdriver.Chrome(executable_path=chrome_driver, options=chromeOptions)
# Open the login page
browser.get(url)
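With the login page open, the login itself is a matter of locating the form fields and submitting. A minimal sketch; the element names fm-login-id / fm-login-password and the button selector are placeholders, substitute the real ones from the target page:

wait = WebDriverWait(browser, 10)
# Wait until the username field is present, then fill in both fields
username = wait.until(EC.presence_of_element_located((By.NAME, 'fm-login-id')))
password = browser.find_element_by_name('fm-login-password')
username.send_keys('my_username')
password.send_keys('my_password')
# Submit the form (selector is a placeholder)
browser.find_element_by_css_selector('button[type="submit"]').click()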
Common operations
# Pages with an iframe require switching into the frame, then back out when done
# Switch into the frame
frame = browser.find_element_by_xpath("//div//iframe")
browser.switch_to.frame(frame)
####### work inside the frame #####
# Switch back out
browser.switch_to.parent_frame()
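When the iframe loads asynchronously, it is safer to wait for it; frame_to_be_available_and_switch_to_it combines the wait and the switch (same XPath as above):

WebDriverWait(browser, 10).until(
    EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//div//iframe")))
####### work inside the frame #####
browser.switch_to.parent_frame()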
# Simulate human typing: takes the browser object, the input's name attribute, and the value to type
def imitateInput(browser, elementName, value):
    element = browser.find_element_by_name(elementName)
    valueName = element.get_attribute("value")
    if valueName == '':
        action = ActionChains(browser)
        action.move_to_element(element).perform()
        action.click(element).perform()
        for key in value:
            if key == '林':
                # Pause longer before this specific character
                time.sleep(0.9)
            else:
                # Random human-like delay between keystrokes
                sleepTime = random.uniform(0.2, 0.4)
                time.sleep(sleepTime)
            element.send_keys(key)
        time.sleep(1)
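Typical usage, with the field name as a placeholder:

# Fill the username field as a human would (field name is a placeholder)
imitateInput(browser, 'fm-login-id', 'my_username')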
# Handling a slider CAPTCHA
# Drag the slider
def typeSlide(browser):
    browser.implicitly_wait(3)
    # Locate the slider button (the id 'nc_1_n1z' appears on Aliyun's noCaptcha slider)
    button = browser.find_element_by_id('nc_1_n1z')
    # Total distance to drag, in pixels
    distance = 300
    # Instantiate an action object
    action = ActionChains(browser)
    action.move_to_element(button).perform()
    # perform() executes the actions queued on the ActionChains
    action.click_and_hold(button).perform()
    # Clear the already-executed actions
    action.reset_actions()
    track = get_track(distance)
    for i in track:
        action.move_by_offset(xoffset=i, yoffset=0).perform()
        action.reset_actions()
    time.sleep(0.5)
    action.release().perform()
    time.sleep(1)
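A regular-looking track still gets rejected sometimes, so a retry wrapper helps. A sketch under the assumption that the widget swaps in a success message on pass; the marker text in check_passed is a guess, adjust it for the actual page:

def check_passed(browser):
    # Assumed success marker; inspect the real widget and adjust
    return '验证通过' in browser.page_source

def slide_with_retry(browser, attempts=3):
    # A rejected drag usually needs a page refresh before the next try
    for _ in range(attempts):
        typeSlide(browser)
        if check_passed(browser):
            return True
        browser.refresh()
        time.sleep(2)
    return False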
# Build the movement track: accelerate for the first 3/4 of the distance, then decelerate
def get_track(distance):
    track = []
    current = 0
    # Switch from acceleration to deceleration at 3/4 of the distance
    mid = distance * 3 / 4
    # Time slice for each step
    t = 0.2
    # Current velocity
    v = 0
    while current < distance:
        if current < mid:
            a = 2    # accelerate
        else:
            a = -3   # decelerate
        v0 = v
        # v = v0 + a*t
        v = v0 + a * t
        # Displacement for this slice: s = v0*t + (1/2)*a*t^2
        move = v0 * t + 1 / 2 * a * t * t
        current += move
        track.append(round(move))
    return track
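Because each step is rounded, the track only approximately sums to distance; it is easy to sanity-check the overshoot:

track = get_track(300)
print(track)       # step sizes grow during acceleration, then shrink
print(sum(track))  # roughly 300; any overshoot can be corrected with one final small move_by_offset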
4. The mitmproxy script
from mitmproxy import ctx

# Start with: mitmdump -p 8080 -s spider/modify_response.py (8080 is mitmproxy's default port)
# Rewrite the anti-bot script's response so the automation-specific identifiers it probes for disappear
def response(flow):
    if '/sufei_data/3.7.2/index.' in flow.request.url:
        for webdriver_key in [
                'webdriver', '__driver_evaluate', '__webdriver_evaluate',
                '__selenium_evaluate', '__fxdriver_evaluate', '__driver_unwrapped',
                '__webdriver_unwrapped', '__selenium_unwrapped', '__fxdriver_unwrapped',
                '_Selenium_IDE_Recorder', '_selenium', 'calledSelenium',
                '_WEBDRIVER_ELEM_CACHE', 'ChromeDriverw', 'driver-evaluate',
                'webdriver-evaluate', 'selenium-evaluate', 'webdriverCommand',
                'webdriver-evaluate-response', '__webdriverFunc', '__webdriver_script_fn',
                '__$webdriverAsyncExecutor', '__lastWatirAlert', '__lastWatirConfirm',
                '__lastWatirPrompt', '$chrome_asyncScriptInfo',
                '$cdc_asdjflasutopfhvcZLmcfl_']:
            ctx.log.info('Remove "{}" from {}.'.format(
                webdriver_key, flow.request.url))
            flow.response.text = flow.response.text.replace(
                '"{}"'.format(webdriver_key), '"NO-ATTR" ')
        flow.response.text = flow.response.text.replace('f.webdriver', 'false')
        flow.response.text = flow.response.text.replace('ChromeDriver', '')
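Once mitmdump is running and the browser is started with the proxy argument above, it is worth checking what the page actually sees; a quick probe from the Selenium side:

# Ideally returns None/False once the automation flags are hidden
print(browser.execute_script('return navigator.webdriver'))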
To summarize: this post walks through handling slider verification and automatic login in a Python 3 crawler, using urllib, BeautifulSoup, Selenium, and mitmproxy. Pages are first requested with urllib and parsed with BeautifulSoup; pages that require login are driven with Selenium via ChromeOptions (keeping the ChromeDriver version matched to the browser); and mitmproxy serves as an intercepting proxy that rewrites the anti-bot JavaScript.