Python+Selenium-5-driver.page_source获取页面源码

本文介绍如何使用Selenium获取网页源码,并通过正则表达式提取煎蛋网首页的所有文章标题,实现自动化信息抓取。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

driver.page_source

Selenium 的 page_source 属性(注意它是属性而不是方法,使用时不加括号)可以获取到当前页面的源码

跟爬虫有点相似,获取到页面资源,提取出我们需要的信息

 

案例

以煎蛋网为例,获取首页的全部title(获取页面源码 -- 使用re正则提取需要的title)

代码

#coding:utf-8
from selenium import webdriver
import re
class JianDan():
    """Collect article titles from the jandan.net home page via Selenium.

    Creating an instance starts a Chrome session, loads
    ``http://jandan.net/``, maximizes the window and sets an implicit wait
    of 3 (seconds, per the Selenium API).  The instance can be used as a
    context manager so the browser is always shut down::

        with JianDan() as jian_dan:
            titles = jian_dan.get_page_title()

    Attributes set by this class:
        browser: the Selenium Chrome driver created in ``__init__``.
        page: the page source captured by the last ``get_page_title`` call.
        titles: the list of titles extracted by the last
            ``get_page_title`` call.
    """

    # Captures the text between 'target="_blank">' and '</a></h2>'.
    # The group is non-greedy so each match stops at the first closing tag
    # instead of running on to the last one in the document.
    _TITLE_PATTERN = re.compile(r'target="_blank">(.*?)</a></h2>')

    def __init__(self):
        self.browser = webdriver.Chrome()
        try:
            self.browser.get("http://jandan.net/")
            self.browser.maximize_window()
            self.browser.implicitly_wait(3)
        except BaseException:
            # Do not leave an orphaned browser behind when navigation or
            # window setup fails; re-raise so the caller still sees the error.
            self.browser.quit()
            raise

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when the ``with`` block exits."""
        self.close()

    def close(self):
        """Quit the browser session held in ``self.browser``."""
        self.browser.quit()

    def get_page_title(self):
        """Extract, print and return the titles found in the page source.

        Reads ``self.browser.page_source``, stores it in ``self.page``,
        stores every regex match in ``self.titles`` and prints one title
        per line.

        Returns:
            The list of matched title strings (empty when nothing matches).
        """
        self.page = self.browser.page_source
        self.titles = self._TITLE_PATTERN.findall(self.page)
        for title in self.titles:
            print(title)
        return self.titles

if __name__ == '__main__':
    # Script entry point: open the site, print every title, then clean up.
    jian_dan = JianDan()
    try:
        jian_dan.get_page_title()
    finally:
        # Always end the browser session, even if scraping raised, so no
        # browser or driver process is left running after the script exits.
        jian_dan.browser.quit()

结果

 

 

from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.edge.service import Service from selenium.webdriver.edge.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.action_chains import ActionChains import time # 配置Edge浏览器选项 def configure_edge_options(): """配置Edge浏览器选项""" options = Options() options.add_argument("--start-maximized") # 最大化窗口 options.add_argument("--disable-notifications") # 禁用通知 # options.add_argument("--headless") # 无头模式(可选) options.add_experimental_option("excludeSwitches", ["enable-automation"]) # 隐藏自动化控制提示 options.add_experimental_option("useAutomationExtension", False) # 禁用自动化扩展 return options # 初始化Edge浏览器驱动 def init_edge_driver(): """初始化Edge浏览器驱动""" try: # 尝试使用webdriver-manager自动管理驱动 from webdriver_manager.microsoft import EdgeChromiumDriverManager service = Service(EdgeChromiumDriverManager().install()) return webdriver.Edge(service=service, options=configure_edge_options()) except: # 如果自动管理失败,尝试手动指定路径 try: # Windows默认路径 service = Service(executable_path="C:/Program Files (x86)/Microsoft/Edge/Application/msedgedriver.exe") return webdriver.Edge(service=service, options=configure_edge_options()) except: # Linux/Mac路径 service = Service(executable_path="/usr/local/bin/msedgedriver") return webdriver.Edge(service=service, options=configure_edge_options()) # 初始化浏览器 driver = init_edge_driver() try: # 1. 打开网页 print("打开百度首页...") driver.get("https://www.baidu.com") # 2. 获取页面标题和URL print(f"页面标题: {driver.title}") print(f"当前URL: {driver.current_url}") # 3. 定位元素并输入搜索词 search_box = driver.find_element(By.ID, "kw") search_box.send_keys("Selenium Edge浏览器自动化") # 4. 提交搜索表单 search_box.submit() # 5. 显式等待结果加载 print("等待搜索结果加载...") WebDriverWait(driver, 10).until( EC.presence_of_element_located((By.ID, "content_left")) ) # 6. 
获取搜索结果数量 result_stats = driver.find_element(By.CLASS_NAME, "nums_text").text print(f"搜索结果: {result_stats}") # 7. 点击第一个搜索结果 first_result = driver.find_element(By.CSS_SELECTOR, "#content_left .result-op h3 a") first_result.click() # 8. 切换到新标签页 print("切换到新标签页...") driver.switch_to.window(driver.window_handles[-1]) # 9. 页面滚动 print("滚动页面...") driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);") time.sleep(1) driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(1) driver.execute_script("window.scrollTo(0, 0);") # 10. 鼠标悬停操作 print("执行鼠标悬停操作...") try: # 尝试在页面上找到可悬停的元素 hover_element = driver.find_element(By.CSS_SELECTOR, "nav a") ActionChains(driver).move_to_element(hover_element).perform() time.sleep(1) except: print("未找到可悬停元素,跳过此步骤") # 11. 浏览器导航操作 print("浏览器导航操作...") driver.back() # 返回上一页 time.sleep(1) driver.forward() # 前进 time.sleep(1) driver.refresh() # 刷新页面 time.sleep(1) # 12. 处理弹窗 print("处理JavaScript弹窗...") # 触发一个alert弹窗 driver.execute_script("alert('这是Selenium测试弹窗!');") # 切换到弹窗并接受 alert = driver.switch_to.alert print(f"弹窗内容: {alert.text}") alert.accept() # 13. 操作浏览器窗口 print("操作浏览器窗口...") # 获取当前窗口大小 print(f"当前窗口尺寸: {driver.get_window_size()}") # 设置窗口大小 driver.set_window_size(800, 600) time.sleep(1) # 最大化窗口 driver.maximize_window() time.sleep(1) # 14. 获取页面截图 print("保存页面截图...") driver.save_screenshot("selenium_screenshot.png") # 15. 获取页面源码 # print("获取页面源码...") # page_source = driver.page_source # with open("page_source.html", "w", encoding="utf-8") as f: # f.write(page_source) print("所有基础操作执行完成!") finally: # 16. 
关闭浏览器 input("按Enter键关闭浏览器...") # 用于演示时查看结果 driver.quit() print("浏览器已关闭") 运行结果: ValueError Traceback (most recent call last) Cell In[9], line 28, in init_edge_driver() 27 from webdriver_manager.microsoft import EdgeChromiumDriverManager ---> 28 service = Service(EdgeChromiumDriverManager().install()) 29 return webdriver.Edge(service=service, options=configure_edge_options()) File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\microsoft.py:73, in EdgeChromiumDriverManager.install(self) 72 def install(self) -> str: ---> 73 driver_path = self._get_driver_binary_path(self.driver) 74 os.chmod(driver_path, 0o755) File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\manager.py:35, in DriverManager._get_driver_binary_path(self, driver) 34 def _get_driver_binary_path(self, driver): ---> 35 binary_path = self._cache_manager.find_driver(driver) 36 if binary_path: File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\driver_cache.py:107, in DriverCacheManager.find_driver(self, driver) 105 return None --> 107 driver_version = self.get_cache_key_driver_version(driver) 108 metadata = self.load_metadata_content() File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\driver_cache.py:154, in DriverCacheManager.get_cache_key_driver_version(self, driver) 153 return self._cache_key_driver_version --> 154 return driver.get_driver_version_to_download() File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\driver.py:48, in Driver.get_driver_version_to_download(self) 46 return self._driver_version_to_download ---> 48 return self.get_latest_release_version() File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\drivers\edge.py:51, in EdgeChromiumDriver.get_latest_release_version(self) 43 latest_release_url = { 44 OSType.WIN 45 in os_type: f"{self._latest_release_url}_{major_edge_version}_WINDOWS", (...) 
49 in os_type: f"{self._latest_release_url}_{major_edge_version}_LINUX", 50 }[True] ---> 51 resp = self._http_client.get(url=latest_release_url) 52 return resp.text.rstrip() File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\http.py:36, in WDMHttpClient.get(self, url, **kwargs) 35 raise exceptions.ConnectionError(f"Could not reach host. Are you offline?") ---> 36 self.validate_response(resp) 37 return resp File ~\AppData\Roaming\Python\Python313\site-packages\webdriver_manager\core\http.py:15, in HttpClient.validate_response(resp) 14 if status_code == 404: ---> 15 raise ValueError(f"There is no such driver by url {resp.url}") 16 elif status_code == 401: ValueError: There is no such driver by url https://msedgedriver.azureedge.net/LATEST_RELEASE_138_WINDOWS During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:64, in DriverFinder._binary_paths(self) 63 if not Path(path).is_file(): ---> 64 raise ValueError(f"The path is not a valid file: {path}") 65 self._paths["driver_path"] = path ValueError: The path is not a valid file: C:/Program Files (x86)/Microsoft/Edge/Application/msedgedriver.exe The above exception was the direct cause of the following exception: NoSuchDriverException Traceback (most recent call last) Cell In[9], line 35, in init_edge_driver() 34 service = Service(executable_path="C:/Program Files (x86)/Microsoft/Edge/Application/msedgedriver.exe") ---> 35 return webdriver.Edge(service=service, options=configure_edge_options()) 36 except: 37 # Linux/Mac路径 File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\edge\webdriver.py:47, in WebDriver.__init__(self, options, service, keep_alive) 45 options = options if options else Options() ---> 47 super().__init__( 48 browser_name=DesiredCapabilities.EDGE["browserName"], 49 vendor_prefix="ms", 50 options=options, 51 
service=service, 52 keep_alive=keep_alive, 53 ) File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\chromium\webdriver.py:53, in ChromiumDriver.__init__(self, browser_name, vendor_prefix, options, service, keep_alive) 52 finder = DriverFinder(self.service, options) ---> 53 if finder.get_browser_path(): 54 options.binary_location = finder.get_browser_path() File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:47, in DriverFinder.get_browser_path(self) 46 def get_browser_path(self) -> str: ---> 47 return self._binary_paths()["browser_path"] File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:78, in DriverFinder._binary_paths(self) 77 msg = f"Unable to obtain driver for {browser}" ---> 78 raise NoSuchDriverException(msg) from err 79 return self._paths NoSuchDriverException: Message: Unable to obtain driver for MicrosoftEdge; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:64, in DriverFinder._binary_paths(self) 63 if not Path(path).is_file(): ---> 64 raise ValueError(f"The path is not a valid file: {path}") 65 self._paths["driver_path"] = path ValueError: The path is not a valid file: /usr/local/bin/msedgedriver The above exception was the direct cause of the following exception: NoSuchDriverException Traceback (most recent call last) Cell In[9], line 42 39 return webdriver.Edge(service=service, options=configure_edge_options()) 41 # 初始化浏览器 ---> 42 driver = init_edge_driver() 44 try: 45 # 1. 
打开网页 46 print("打开百度首页...") Cell In[9], line 39, in init_edge_driver() 36 except: 37 # Linux/Mac路径 38 service = Service(executable_path="/usr/local/bin/msedgedriver") ---> 39 return webdriver.Edge(service=service, options=configure_edge_options()) File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\edge\webdriver.py:47, in WebDriver.__init__(self, options, service, keep_alive) 44 service = service if service else Service() 45 options = options if options else Options() ---> 47 super().__init__( 48 browser_name=DesiredCapabilities.EDGE["browserName"], 49 vendor_prefix="ms", 50 options=options, 51 service=service, 52 keep_alive=keep_alive, 53 ) File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\chromium\webdriver.py:53, in ChromiumDriver.__init__(self, browser_name, vendor_prefix, options, service, keep_alive) 50 self.service = service 52 finder = DriverFinder(self.service, options) ---> 53 if finder.get_browser_path(): 54 options.binary_location = finder.get_browser_path() 55 options.browser_version = None File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:47, in DriverFinder.get_browser_path(self) 46 def get_browser_path(self) -> str: ---> 47 return self._binary_paths()["browser_path"] File ~\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\driver_finder.py:78, in DriverFinder._binary_paths(self) 76 except Exception as err: 77 msg = f"Unable to obtain driver for {browser}" ---> 78 raise NoSuchDriverException(msg) from err 79 return self._paths NoSuchDriverException: Message: Unable to obtain driver for MicrosoftEdge; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
最新发布
08-09
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值