Python---selenium模拟浏览器爬虫

最新推荐文章于 2025-06-13 14:27:59 发布

原创最新推荐文章于 2025-06-13 14:27:59 发布 · 343 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #爬虫 #selenium

该代码示例展示了一个 Python 脚本：它使用 Selenium 的 Chrome WebDriver，并通过 ChromeOptions 降低被网站识别为自动化工具的风险，再用 BeautifulSoup 解析 HTML，从 jiandaoyun.com 网站抓取表格数据并保存到 Excel 文件中。脚本逐页遍历页面，提取指定表格中每一行的数据及图片链接。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

#coding=utf-8
from selenium import webdriver
import time
import xlwt
from bs4 import BeautifulSoup
# Workbook and worksheet that receive the scraped table rows; both are
# module-level because page() writes to `sheet` and saves `book`.
book = xlwt.Workbook(
    encoding='utf-8',
    style_compression=0,
)
# cell_overwrite_ok=True: page() may write the same cell more than once.
sheet = book.add_sheet(
    'mysheet',
    cell_overwrite_ok=True,
)

from openpyxl import Workbook
# browser=webdriver.Chrome()
# browser.get('https://www.jiandaoyun.com/app/5df9ec636c793000064c6868/entry/5df9ed16073bbd00067f1d54')
#
# browser.close()
from lxml import etree
# Configure Chrome to run with an explicit user-data directory
# (presumably so an existing login session is reused -- confirm).
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument(
    r"user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data1"
)

# Keep the site from noticing that the browser is driven by automation.
chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])

# The driver path must be a raw string: as a normal literal, "\Y", "\p" and
# "\c" are invalid escape sequences that only work by accident and trigger
# warnings on current Python versions.
browser = webdriver.Chrome(
    options=chromeOptions,
    executable_path=r"E:\YANG_WORK\package_some\chromedriver.exe",
)
browser.get('https://www.jiandaoyun.com/app/5df9ec636c793000064c6868/entry/5df9ed16073bbd00067f1d54')

def page(i):
    """Copy the table on the currently displayed page into the worksheet.

    Sleeps 10 seconds, parses ``browser.page_source`` with BeautifulSoup and
    walks every ``tr`` of the first ``tbody``. The text of each ``td`` is
    written to consecutive columns starting at column 0, and the ``src`` of
    the first three ``img`` tags of the row (when present) goes to columns
    5, 6 and 7. The workbook is then saved to ``test5.xls``.

    Args:
        i: Worksheet row index used for the first table row; each further
            table row is written one row below the previous one.

    Raises:
        RuntimeError: If the page source contains no ``tbody`` element.
    """
    # Fixed delay before reading the DOM (presumably to let the page finish
    # rendering -- there is no explicit wait condition).
    time.sleep(10)
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")  # alternatives: lxml / xml parsers
    tb = soup.tbody
    print("hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh")
    print(type(tb))
    if tb is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("no tbody element found in the page source")

    image_columns = (5, 6, 7)
    for y, row in enumerate(tb.find_all('tr'), start=i):
        images = row.find_all('img')
        print('a标签中的href属性是：', images)

        cells = row.find_all('td')
        for x, cell in enumerate(cells):
            print('td的值是：', cell.string)
            sheet.write(y, x, cell.string)

        # Image links are written once per row, after the cells, so they
        # still take precedence over cell text in columns 5-7. zip() stops
        # at the shorter sequence, so rows with fewer than three images
        # (or none) are handled without index errors. Rows without any
        # td cell get no image columns, as before.
        if cells:
            for column, image in zip(image_columns, images):
                sheet.write(y, column, image.get('src'))

    book.save('test5.xls')

# Scrape six consecutive result pages. Each page is written to its own band
# of worksheet rows (NOTE(review): assumes a page never holds more rows than
# the gap between two start rows -- confirm against the site's page size).
try:
    for index, start_row in enumerate((1, 100, 200, 300, 400, 500)):
        if index:
            # Advance to the next page before scraping all but the first one.
            browser.find_element_by_xpath('//button[@class="page-btn next"]').click()
        page(start_row)
finally:
    # Always end the WebDriver session so Chrome and chromedriver do not
    # linger after the script finishes or fails.
    browser.quit()