安装
# 安装selenium
pip install -U selenium
# centos安装chrome(如有必要)
yum install -y google-chrome-stable_current_x86_64.rpm
Drivers
Chrome: https://sites.google.com/a/chromium.org/chromedriver/downloads
Edge: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
Firefox: https://github.com/mozilla/geckodriver/releases
Safari: https://webkit.org/blog/6900/webdriver-support-in-safari-10/
headless
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('../cookie/chromedriver', chrome_options=options)
相关操作
获取页面
from selenium import webdriver
browser = webdriver.Firefox()
browser.get('http://seleniumhq.org/')
等待页面加载完成(Waits)
显式等待
- 等待一定条件发生后再进一步执行你的代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "myDynamicElement"))
)
finally:
driver.quit()
上面的代码最多等待10秒:如果在10秒内找到了要查找的元素就立即返回该元素,否则抛出TimeoutException异常。 WebDriverWait 默认情况下会每500毫秒调用一次ExpectedCondition,直到结果成功返回。 ExpectedCondition成功的返回结果是布尔类型的True,或者是一个不为None的返回值。
- 预期条件
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
element = wait.until(EC.element_to_be_clickable((By.ID,'someid')))
- title_is
- title_contains
- presence_of_element_located
- visibility_of_element_located
- visibility_of
- presence_of_all_elements_located
- text_to_be_present_in_element
- text_to_be_present_in_element_value
- frame_to_be_available_and_switch_to_it
- invisibility_of_element_located
- element_to_be_clickable(元素显示并可用)
- staleness_of
- element_to_be_selected
- element_located_to_be_selected
- element_selection_state_to_be
- element_located_selection_state_to_be
- alert_is_present
请参考python selenium expected_conditions使用实例
隐式等待
隐式等待是告诉WebDriver在查找元素时,如果元素没有立即出现,就在指定的时间内持续轮询DOM。 默认等待时间是0秒;一旦设置该值,它会在该WebDriver实例的整个生命周期内生效。
from selenium import webdriver
driver = webdriver.Firefox()
driver.implicitly_wait(10) # seconds
driver.get("http://somedomain/url_that_delays_loading")
myDynamicElement = driver.find_element_by_id("myDynamicElement")
获取页面元素
element = driver.find_element_by_id("passwd-id")
element = driver.find_element_by_name("passwd")
element = driver.find_elements_by_tag_name("input")
element = driver.find_element_by_xpath("//input[@id='passwd-id']")
# 根据链接文本获取超链接
element_link = driver.find_element_by_link_text('Continue')
element_link = driver.find_element_by_partial_link_text('Conti') # 部分文本
获取元素属性
element.get_attribute('')
driver.title
element.text
操作元素
# 输入文本并按下方向键(需先导入: from selenium.webdriver.common.keys import Keys)
element.send_keys("and some", Keys.ARROW_DOWN)
# 清除叠加的输入文本
element.clear()
填充表单
第一种方式
element = driver.find_element_by_xpath("//select[@name='name']")
all_options = element.find_elements_by_tag_name("option")
for option in all_options:
print("Value is: %s" % option.get_attribute("value"))
option.click()
第二种方式Select
from selenium.webdriver.support.ui import Select
select = Select(driver.find_element_by_name('name'))
select.select_by_index(index) # 从0开始计数
select.select_by_visible_text("text") # 根据text值
select.select_by_value(value) # 根据value值
提交表单
driver.find_element_by_id("submit").click()
# 单独提交某个元素
element.submit()
添加头信息
# !/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver
# 进入浏览器设置
options = webdriver.ChromeOptions()
# 设置中文
options.add_argument('lang=zh_CN.UTF-8')
# 更换头部
options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
browser = webdriver.Chrome(chrome_options=options)
url = "https://httpbin.org/get?show_env=1"
browser.get(url)
browser.quit()
Cookies
添加cookies
# Go to the correct domain
driver.get("http://www.example.com")
# Now set the cookie. This one's valid for the entire domain
cookie = {'name' : 'foo', 'value' : 'bar'}
driver.add_cookie(cookie)
获取cookies
# Go to the correct domain
driver.get("http://www.example.com")
# And now output all the available cookies for the current URL
driver.get_cookies()
传递cookies
import requests
from selenium import webdriver
driver = webdriver.Firefox()
url = "some_url" #a redirect to a login page occurs
driver.get(url)
#storing the cookies generated by the browser
request_cookies_browser = driver.get_cookies()
#making a persistent connection using the requests library
params = {'os_username':'username', 'os_password':'password'}
s = requests.Session()
#passing the cookies generated from the browser to the session
c = [s.cookies.set(c['name'], c['value']) for c in request_cookies_browser]
resp = s.post(url, params) #I get a 200 status_code
#passing the cookie of the response to the browser
dict_resp_cookies = resp.cookies.get_dict()
response_cookies_browser = [{'name':name, 'value':value} for name, value in dict_resp_cookies.items()]
c = [driver.add_cookie(c) for c in response_cookies_browser]
#the browser now contains the cookies generated from the authentication
driver.get(url)
截取元素图片
# 截取图片验证码并保存
element = self.driver.find_element_by_id(self.check_code_id)
element.screenshot(png_name)
Centos中部署所遇问题
- centos中安装
yum update
yum -y install google-chrome-stable
- centos中运行报错
DevToolsActivePort file doesn't exist
解决:
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('/path/to/your_chrome_driver_dir/chromedriver',chrome_options=chrome_options)
Chrome version must be between 70 and 73
解决:
通过/opt/google/chrome/chrome --version
确认安装的chrome版本,检查下载的chromedriver版本,确保两者一致即可解决此问题
参考:
Selenium Client Driver
Python爬虫利器五之Selenium的用法
Selenium Navigating
Selenium passing cookies
Selenium 查找元素
selenium设置Chrome
CentOS 7 快速安装 Chrome 浏览器