使用selenium库，获取网站html_selenium获取网页html-CSDN博客

本文链接：https://blog.csdn.net/hanjiangxue0912/article/details/141091254

selenium安装：pip install selenium

使用selenium库获取指定网页的html数据：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
 
 
def get_push(url='http://www.baidu.com'):
    """Load a page in headless Chrome and return its status code and HTML.

    Args:
        url: Address of the page to load. Defaults to the Baidu home page.

    Returns:
        A ``(status_code, page_html)`` tuple when the page was loaded, where
        ``status_code`` is the ``responseStatus`` of the first browser
        performance entry and ``page_html`` is the page source.
        Returns ``None`` if loading the page or reading its data raised an
        exception; the exception is printed rather than propagated.
    """
    # Configure Chrome to run headless (no visible browser window).
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("no-sandbox")
    chrome_options.add_argument("--disable-extensions")
    # Start the browser. This happens outside the try block so that the
    # finally clause never runs without a browser object to quit.
    browser = webdriver.Chrome(options=chrome_options)
    try:
        # Navigate to the target page.
        browser.get(url)
        # NOTE: pages that render content with JavaScript may need an explicit
        # wait here before the page source is read.
        # Read the HTTP status from the browser's performance timeline.
        # NOTE(review): presumably the first entry is the navigation request,
        # and the value may be None on browsers that do not expose
        # `responseStatus` -- confirm against the Chrome version in use.
        status_code = browser.execute_script(
            "return window.performance.getEntries()[0].responseStatus"
        )
        # Grab the page source.
        page_html = browser.page_source
        return status_code, page_html
    except Exception as e:
        print(f"Exception: {e}")
        return None
    finally:
        # Always shut the browser down, on success and on failure.
        browser.quit()
 
 
if __name__ == '__main__':
    # Try to fetch the page up to `max_retries` times, pausing between
    # attempts, and save the HTML to test.html on the first 200 response.
    max_retries = 5
    retry_delay_seconds = 10

    for attempt in range(1, max_retries + 1):
        web_data = get_push()
        # get_push() returns None when the fetch raised, so guard before
        # unpacking; a failed fetch is treated like a non-200 response.
        if web_data is None:
            status_code, page_html = None, None
        else:
            status_code, page_html = web_data

        if status_code == 200:
            # Write the HTML file.
            with open('test.html', 'w', encoding='utf-8') as fd:
                fd.write(page_html)
            print("Success to save HTML file !!!")
            break

        if attempt < max_retries:
            print(f"Retry {attempt}/{max_retries}. Waiting for 10 seconds before retrying...")
            time.sleep(retry_delay_seconds)
    else:
        # Every attempt failed; report it instead of exiting silently.
        print(f"Failed to save HTML file after {max_retries} attempts.")