Selenium
参考崔庆才爬虫
安装: pip install selenium
注意:需要下载Chrome的 webdriver(chromedriver)
下载完成后解压到与python.exe可执行文件同一目录下
基本使用
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Launch Chrome and open the Taobao home page.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")

# Locate the search box (id="q") through two different strategies.
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
# A third lookup of the same element via XPath would be:
# input_third = browser.find_element_by_xpath('//*[@id="q"]')

# Print the WebElement handles returned by both lookups, then close the window.
print(input_first, input_second)
browser.close()
<selenium.webdriver.remote.webelement.WebElement (session="4432d05b280a89cf54be45c143d95cff", element="0.02248678307764207-1")> <selenium.webdriver.remote.webelement.WebElement (session="4432d05b280a89cf54be45c143d95cff", element="0.02248678307764207-1")>
声明浏览器对象
from selenium import webdriver
# Create the driver object for the browser to automate (Chrome here).
browser = webdriver.Chrome()
# Other browsers are created the same way:
# browser = webdriver.Firefox()
# browser = webdriver.Edge()
访问页面
from selenium import webdriver
# Open a page and show the first 500 characters of its source.
browser = webdriver.Chrome()
browser.get("http://www.zhihu.com")
html_head = browser.page_source[:500]
print(html_head)
browser.close()
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="zh" data-hairline="true" data-theme="light"><head><meta charset="utf-8" /><title>知乎 - 有问题上知乎</title><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1" /><meta name="renderer" content="webkit" /><meta name="force-rendering" content="webkit" /><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /><meta name="google-site-verification" content="FTeR0c8arOPKh8c5DYh_9uu98_zJbaWw53J-Sch9MTg" /><tit
查找元素
单个元素
from selenium import webdriver
# Open the page and show the first 500 characters of its source.
browser = webdriver.Chrome()
browser.get("http://www.gdsgj.com")
html_head = browser.page_source[:500]
print(html_head)
print("...............")

# Locate the element with id="nav_box" via two different strategies.
input_1 = browser.find_element_by_id("nav_box")
input_2 = browser.find_element_by_css_selector('#nav_box')
# The equivalent XPath lookup would be:
# input_3 = browser.find_element_by_xpath('//*[@id="nav_box"]')

print(input_1, input_2)
browser.close()
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head>
<!--360浏览器兼容模式定义如下一条语句-->
<meta name="renderer" content="ie-comp" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="Content-Language" content="zh-cn" />
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<meta name="Keywords" content="广东省高级技工学校,广东省技师学院,惠州博罗广东省高级技工学
...............
<selenium.webdriver.remote.webelement.WebElement (session="fbb5ecf6b543de1ed103c3ac991b3f60", element="0.3298393285200234-1")> <selenium.webdriver.remote.webelement.WebElement (session="fbb5ecf6b543de1ed103c3ac991b3f60", element="0.3298393285200234-1")>
- find_element_by_name()
- find_element_by_xpath()
- find_element_by_id()
- find_element_by_css_selector()
- find_element_by_tag_name()
- find_element_by_class_name()
- find_element_by_link_text()
- find_element_by_partial_link_text()
多个元素
from selenium import webdriver
# find_elements_* returns a list of every matching element.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
li = browser.find_elements_by_css_selector('li')
first_twenty = li[:20]  # only show the first 20 <li> handles
print(first_twenty)
browser.close()
[<selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-1")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-2")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-3")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-4")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-5")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-6")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-7")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-8")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-9")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-10")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-11")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-12")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-13")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-14")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-15")>, <selenium.webdriver.remote.webelement.WebElement 
(session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-16")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-17")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-18")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-19")>, <selenium.webdriver.remote.webelement.WebElement (session="47c77f533e7b9de89a67db541307f4d0", element="0.12123117773549641-20")>]
- find_elements_by_name()
- find_elements_by_xpath()
- find_elements_by_id()
- find_elements_by_css_selector()
- find_elements_by_tag_name()
- find_elements_by_class_name()
- find_elements_by_link_text()
- find_elements_by_partial_link_text()
元素交互动作
对获取的元素调用交互方法
from selenium import webdriver
import time
# Type into Taobao's search box, replace the text, then click the search button.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")

# Named search_box rather than `input` so the builtin input() is not shadowed.
search_box = browser.find_element_by_id('q')  # search text field
search_box.send_keys('iphone')
time.sleep(1)  # brief pause between typing and clearing
search_box.clear()
search_box.send_keys('ipad')

button = browser.find_element_by_class_name('btn-search')  # search button
button.click()
交互动作
将动作附加到动作链中串行执行
from selenium import webdriver
from selenium.webdriver import ActionChains
# Drag one element onto another using an ActionChains sequence.
browser = webdriver.Chrome()
url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)

# The demo widgets are looked up inside the 'iframeResult' frame.
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')  # element to drag
target = browser.find_element_by_css_selector('#droppable')  # drop destination

actions = ActionChains(browser)  # create an action chain
# drag_and_drop() only queues the action; perform() actually executes the chain.
actions.drag_and_drop(source, target).perform()
执行JavaScript
from selenium import webdriver
# Run JavaScript in the page: scroll to the bottom, then pop up an alert.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
for script in (
    'window.scrollTo(0,document.body.scrollHeight)',
    'alert("To Bottom")',
):
    browser.execute_script(script)
获取元素
获取属性
from selenium import webdriver
# Read an HTML attribute from a located element.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
logo = browser.find_element_by_id('zh-top-link-logo')
print(logo)
logo_class = logo.get_attribute('class')  # value of the element's class attribute
print(logo_class)
<selenium.webdriver.remote.webelement.WebElement (session="1c8eaf5705221cac70f1bebd5ba4be05", element="0.010884747176694631-1")>
zu-top-link-logo
获取文本值
from selenium import webdriver
# Print the visible text of an element.
browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
# Named question_button rather than `input` so the builtin input() is not shadowed.
question_button = browser.find_element_by_class_name('zu-top-add-question')
print(question_button.text)
提问
获取ID,位置,标签名,大小
from selenium import webdriver
# Print an element's internal id, page location, tag name and rendered size.
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
# Named question_button rather than `input` so the builtin input() is not shadowed.
question_button = browser.find_element_by_class_name('zu-top-add-question')
print(question_button.id)        # WebDriver-internal element id
print(question_button.location)  # dict with 'x' and 'y'
print(question_button.tag_name)
print(question_button.size)      # dict with 'width' and 'height'
0.9653522968329347-1
{'x': 759, 'y': 7}
button
{'width': 66, 'height': 32}
Frame
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Demonstrate that element lookups are scoped to the currently selected frame.
browser = webdriver.Chrome()
url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)

# Enter the result iframe and look up #draggable from inside it.
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
print(source)

# Look up the element with class "logo" while still inside the iframe;
# if it is not part of this frame the lookup raises NoSuchElementException.
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO')

# Switch back to the parent frame and repeat the same lookup there.
browser.switch_to.parent_frame()
logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)
<selenium.webdriver.remote.webelement.WebElement (session="677c65a0ea4a4f9ceaf0df89eaf6bcde", element="0.6027971659640068-1")>
NO LOGO
<selenium.webdriver.remote.webelement.WebElement (session="677c65a0ea4a4f9ceaf0df89eaf6bcde", element="0.20115529620564865-2")>
RUNOOB.COM
等待
隐式等待
当使用了隐式等待执行测试的时候,如果WebDriver没有在DOM中找到元素,将继续等待,超出设定时间后则抛出找不到元素的异常。换句话说,当查找元素而元素并没有立即出现的时候,隐式等待将等待一段时间再查找DOM元素,默认时间是0
from selenium import webdriver
# Implicit wait: element lookups keep retrying for up to 10 seconds before failing.
browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.zhihu.com/explore')
# Named question_button rather than `input` so the builtin input() is not shadowed.
question_button = browser.find_element_by_class_name('zu-top-add-question')
print(question_button)
<selenium.webdriver.remote.webelement.WebElement (session="3bcd9a9bf78deee11cee2dcc6e9f358a", element="0.6158222971089322-1")>
显式等待
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Explicit wait: block until a specific condition holds, for at most 10 seconds.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com/")
wait = WebDriverWait(browser, 10)

# Named search_box rather than `input` so the builtin input() is not shadowed.
search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
# A CSS class selector needs the leading dot. The bare selector 'btn-search'
# matches a <btn-search> tag instead, which never appears and makes the wait
# end in a TimeoutException.
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_box, button)
---------------------------------------------------------------------------
TimeoutException Traceback (most recent call last)
<ipython-input-14-88189fce9bb7> in <module>()
8 wait=WebDriverWait(browser,10)
9 input=wait.until(EC.presence_of_element_located((By.ID,'q')))
---> 10 button=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'btn-search')))
11 print(input,button)
C:\Program Files\Anaconda3\lib\site-packages\selenium\webdriver\support\wait.py in until(self, method, message)
78 if time.time() > end_time:
79 break
---> 80 raise TimeoutException(message, screen, stacktrace)
81
82 def until_not(self, method, message=''):
TimeoutException: Message:
前进后退
from selenium import webdriver
import time
# Visit three pages in sequence, then move through the browser history.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
browser.get('http://www.zhihu.com')
browser.get('http://www.baidu.com')
browser.back()     # go back one step in history
time.sleep(1)
browser.forward()  # go forward one step (was misspelled `borwser`, a NameError)
browser.close()
Cookies
from selenium import webdriver
# Inspect, add and delete cookies for the current site.
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com')
print(browser.get_cookies())  # cookies before any change

new_cookie = {'name': 'name', 'domain': 'www.zhihu.com', 'value': '19'}
browser.add_cookie(new_cookie)
print(browser.get_cookies())  # cookies after adding one

browser.delete_all_cookies()
print(browser.get_cookies())  # cookies after deleting all of them
[{'httpOnly': False, 'value': '8ffa4a0b7ecd9bdb5ad19b8c1037b063', 'domain': 'www.zhihu.com', 'path': '/', 'expiry': 1547781217.917957, 'secure': False, 'name': 'tgw_l7_route'}, {'httpOnly': False, 'value': 'd5e6b7cd-b27f-4ad9-b093-59625fa87e08', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1610852317.918046, 'secure': False, 'name': '_zap'}, {'httpOnly': False, 'value': 'Mg3lICZ9264KnmroQXynie0G5Cyu1p8s', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1625540317.918098, 'secure': False, 'name': '_xsrf'}]
[{'httpOnly': False, 'value': '8ffa4a0b7ecd9bdb5ad19b8c1037b063', 'domain': 'www.zhihu.com', 'path': '/', 'expiry': 1547781217.917957, 'secure': False, 'name': 'tgw_l7_route'}, {'httpOnly': False, 'value': 'd5e6b7cd-b27f-4ad9-b093-59625fa87e08', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1610852317.918046, 'secure': False, 'name': '_zap'}, {'httpOnly': False, 'value': 'Mg3lICZ9264KnmroQXynie0G5Cyu1p8s', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1625540317.918098, 'secure': False, 'name': '_xsrf'}, {'httpOnly': False, 'value': '19', 'domain': 'www.zhihu.com', 'path': '/', 'expiry': 2178500320, 'secure': True, 'name': 'name'}]
[{'httpOnly': False, 'value': 'dylbYgEEdf0gMQDfYcNzi7XcXym3wJlH', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1625540321.166329, 'secure': False, 'name': '_xsrf'}, {'httpOnly': False, 'value': '"AODj0QbN1w6PTv69r6L5VykWXy6ahFXDzko=|1547780320"', 'domain': '.zhihu.com', 'path': '/', 'expiry': 1642388320.980894, 'secure': False, 'name': 'd_c0'}, {'httpOnly': False, 'value': '025a67177706b199591bd562de56e55b', 'domain': 'www.zhihu.com', 'path': '/', 'expiry': 1547781220.166226, 'secure': False, 'name': 'tgw_l7_route'}]
选项卡管理
from selenium import webdriver
import time
# Open a second tab via JavaScript and switch between the two tabs.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
browser.execute_script('window.open()')
print(browser.window_handles)

# switch_to_window() is deprecated ("use driver.switch_to.window instead"),
# so the switch_to.window() form is used here.
browser.switch_to.window(browser.window_handles[1])
browser.get('http://www.baidu.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://python.org')
['CDwindow-08C207F70FD501825F3B509EE9951B7E', 'CDwindow-949E824D7CE34A78D5C1843BFDCBF9CE', 'CDwindow-AE3C151DCAC4A78BCCFC94DB976AF680']
C:\Program Files\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: DeprecationWarning: use driver.switch_to.window instead
C:\Program Files\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: DeprecationWarning: use driver.switch_to.window instead
异常处理
from selenium import webdriver
# Deliberately trigger an exception by looking up an element that does not exist.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
# No element with id="hello" can be found, so this lookup raises an exception.
browser.find_element_by_id('hello')
---------------------------------------------------------------------------
NoSuchElementException Traceback (most recent call last)
<ipython-input-18-cc86308282e5> in <module>()
3 browser=webdriver.Chrome()
4 browser.get('https://www.baidu.com')
----> 5 browser.find_element_by_id('hello') #无法查找到id="hello"的元素,产生异常
C:\Program Files\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py in find_element_by_id(self, id_)
358 element = driver.find_element_by_id('foo')
359 """
--> 360 return self.find_element(by=By.ID, value=id_)
361
362 def find_elements_by_id(self, id_):
C:\Program Files\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py in find_element(self, by, value)
976 return self.execute(Command.FIND_ELEMENT, {
977 'using': by,
--> 978 'value': value})['value']
979
980 def find_elements(self, by=By.ID, value=None):
C:\Program Files\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py in execute(self, driver_command, params)
319 response = self.command_executor.execute(driver_command, params)
320 if response:
--> 321 self.error_handler.check_response(response)
322 response['value'] = self._unwrap_value(
323 response.get('value', None))
C:\Program Files\Anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py in check_response(self, response)
240 alert_text = value['alert'].get('text')
241 raise exception_class(message, screen, stacktrace, alert_text)
--> 242 raise exception_class(message, screen, stacktrace)
243
244 def _value_or_default(self, obj, key, default):
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"id","selector":"hello"}
(Session info: chrome=71.0.3578.98)
(Driver info: chromedriver=2.43.600210 (68dcf5eebde37173d4027fa8635e332711d2874a),platform=Windows NT 10.0.17134 x86_64)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException
# Catch the specific Selenium exceptions instead of letting the script crash.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.find_element_by_id('hello')
except (TimeoutException, NoSuchElementException) as e:
    # Only the two imported Selenium exception types are handled here;
    # anything else is left to propagate.
    print('Exception', e)
finally:
    # Always release the browser window, whether or not the lookup failed.
    browser.close()
Exception Message: no such element: Unable to locate element: {"method":"id","selector":"hello"}
(Session info: chrome=71.0.3578.98)
(Driver info: chromedriver=2.43.600210 (68dcf5eebde37173d4027fa8635e332711d2874a),platform=Windows NT 10.0.17134 x86_64)