一、前言
通过自动化页面点击操作来实现采集携程酒店数据。
二、主代码
import random
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class SpiderTest1():
    """Drive a Chrome browser through ctrip.com to list hotels and their rooms.

    The crawler works purely by UI automation: it logs in with a phone
    number (the slider captcha and SMS code are completed by hand), searches
    for each hotel keyword, opens every search result in its own tab and
    prints the hotel name, address and room names found there.

    NOTE(review): every locator below is copied from the page markup the
    original author saw; several are generated class names and will need
    updating whenever the site changes.
    """

    # Seconds to wait for an element before giving up.
    _WAIT_TIMEOUT = 10
    # The search button on the home page; used both to dismiss the login
    # popup and to launch the real search.
    _SEARCH_BUTTON_XPATH = '//*[@id="kakxi"]/li[6]/div/span'

    def __init__(self, phone="自己的手机号", chromedriver_path='用自己的浏览器驱动路径'):
        """Start Chrome with a few tweaks that make automation less obvious.

        Args:
            phone: Phone number typed into the login form.
            chromedriver_path: Filesystem path of the chromedriver binary.
        """
        self.phone = phone

        chrome_options = Options()
        # Common switches that lower the chance of being flagged as a bot;
        # excluding "enable-automation" also hides the "controlled by
        # automated test software" banner.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        # Keep the browser window open after the script ends.
        chrome_options.add_experimental_option("detach", True)

        service = Service(executable_path=chromedriver_path)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.action = ActionChains(self.driver)

        # Hide navigator.webdriver in the document that is already open ...
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        # ... and, through CDP, in every document loaded afterwards.
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
            Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
            })
            '''
        })

        # Handle of the search-results window; set by crawl_hotel().
        self.main_window = None

    def _click_when_ready(self, by, value, timeout=None):
        """Wait until the element is clickable, click it and return it.

        Raises:
            TimeoutException: The element did not become clickable in time.
        """
        if timeout is None:
            timeout = self._WAIT_TIMEOUT
        element = WebDriverWait(self.driver, timeout).until(
            EC.element_to_be_clickable((by, value))
        )
        element.click()
        return element

    @staticmethod
    def _find_new_window(current_handles, known_handles):
        """Return the first handle in current_handles absent from known_handles.

        Comparing against the handles recorded *before* the click guarantees
        that a tab left over from an earlier failure is never mistaken for
        the freshly opened one. Returns None when no new window exists.
        """
        for handle in current_handles:
            if handle not in known_handles:
                return handle
        return None

    def _close_windows_opened_since(self, known_handles):
        """Close every window not in known_handles, then focus the main one."""
        for handle in self.driver.window_handles:
            if handle not in known_handles:
                self.driver.switch_to.window(handle)
                self.driver.close()
        if self.main_window in self.driver.window_handles:
            self.driver.switch_to.window(self.main_window)

    def manual_login(self, wait_seconds=60):
        """Pause so the user can log in entirely by hand.

        Args:
            wait_seconds: How long to wait for the manual login.
        """
        print(f"你有 {wait_seconds}s 时间进行登录,请手动操作页面进行登录")
        time.sleep(wait_seconds)

    def voluntarily_login(self, wait_seconds=30):
        """Fill in the SMS login form, then pause for the manual steps.

        The method opens the login dialog, switches to verification-code
        login, types the phone number, ticks the agreement box and requests
        a code. The slider captcha and the code itself must be entered by
        the user during the following pause.

        Args:
            wait_seconds: How long to wait for the manual steps.

        Raises:
            TimeoutException: One of the form elements never appeared.
        """
        # Open the login dialog.
        self._click_when_ready(By.CLASS_NAME, 'tl_nfes_home_header_login_title_5neWJ')
        # Switch to verification-code login.
        self._click_when_ready(By.CLASS_NAME, 'login-entry-dynamic')
        # Type the phone number once the input is visible.
        phone_input = WebDriverWait(self.driver, self._WAIT_TIMEOUT).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="bbz_accounts_pc_lg_box"]/div/div/div[1]/div[2]/form/dl[2]/dd/dl/dd/input')
            )
        )
        phone_input.send_keys(self.phone)
        # Tick the agreement checkbox.
        self._click_when_ready(
            By.XPATH, '//*[@id="bbz_accounts_pc_lg_box"]/div/div/div[1]/div[4]/div/div[1]/label'
        )
        # Request the verification code.
        # NOTE(review): the trailing space in this class name is kept from
        # the original locator; confirm whether it is intentional.
        self._click_when_ready(By.CLASS_NAME, 'btn-primary-s ')
        # Give the user time to pass the slider and type the code.
        print(f"你有 {wait_seconds}s 时间输入验证码,请手动操作页面进行登录")
        time.sleep(wait_seconds)

    def pass_login(self):
        """Dismiss the login page that the first search click triggers.

        Clicks the search button once and then returns to the home page, so
        that the real search performed later is not interrupted.
        """
        self._click_when_ready(By.XPATH, self._SEARCH_BUTTON_XPATH)
        time.sleep(1)
        # Go back to the home page.
        self._click_when_ready(By.XPATH, '//*[@id="hp_nfes_homepage"]/span')
        time.sleep(2)

    def crawl_hotel(self, city_name, hotel_name):
        """Search for a hotel and crawl every entry of the result list.

        Args:
            city_name: Destination typed into the city box; may be "" when
                hotel_name alone identifies the hotel.
            hotel_name: Keyword typed into the hotel box.
        """
        self.main_window = self.driver.current_window_handle

        # Destination: typed one character at a time with random pauses to
        # look like a human typist.
        city_box = WebDriverWait(self.driver, self._WAIT_TIMEOUT).until(
            EC.element_to_be_clickable((By.ID, 'hotels-destination'))
        )
        city_box.click()
        time.sleep(1)
        city_box.clear()
        for char in city_name:
            city_box.send_keys(char)
            time.sleep(random.uniform(0.1, 0.3))
        time.sleep(1)
        city_box.send_keys(" ")
        time.sleep(2)
        # Look the box up again before tabbing out of it, as the original
        # code did.
        self.driver.find_element(By.ID, 'hotels-destination').send_keys(Keys.TAB)

        # Hotel keyword.
        hotel_box = self.driver.find_element(By.ID, 'keyword')
        hotel_box.clear()
        hotel_box.send_keys(hotel_name)
        time.sleep(1)

        self._click_when_ready(By.XPATH, self._SEARCH_BUTTON_XPATH)

        # The result list is rendered some time after the click, and a
        # search may legitimately return nothing, so wait for it explicitly
        # instead of reading the page at once.
        try:
            hotel_elements = WebDriverWait(self.driver, self._WAIT_TIMEOUT).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "list-item"))
            )
        except TimeoutException:
            print(f"没有搜索到酒店: {hotel_name}")
            return

        for i, hotel in enumerate(hotel_elements):
            print(f"\n正在处理第 {i + 1} 个酒店...")
            # Remember which windows exist so the new tab can be told apart
            # from anything that was already open.
            handles_before = set(self.driver.window_handles)
            try:
                # Clicking a result opens the hotel page in a new tab.
                hotel.click()
                time.sleep(2)
                new_window = self._find_new_window(self.driver.window_handles, handles_before)
                if new_window is None:
                    print(f"处理第 {i + 1} 个酒店时出错: 点击后没有打开新窗口")
                    continue
                self.driver.switch_to.window(new_window)
                # Let the hotel page finish loading.
                time.sleep(2)
                self.crawl_room()
            except Exception as e:
                # Best effort: one broken hotel page must not abort the rest.
                print(f"处理第 {i + 1} 个酒店时出错: {str(e)}")
            finally:
                # Always close the tab opened for this hotel, on failure as
                # well as on success, and return to the result list.
                self._close_windows_opened_since(handles_before)
                # Short pause before the next hotel.
                time.sleep(1)

    def crawl_room(self):
        """Print the hotel name, address and room names of the current page.

        Raises:
            NoSuchElementException: The hotel name or address is missing.
        """
        # Expand the folded room types when the button exists; a short
        # timeout keeps pages without the button fast.
        try:
            rest_room = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'mainRoomList-foldButton_text__yb3An'))
            )
            rest_room.click()
            print("已点击展开其余房型")
            time.sleep(2)  # wait for the extra room types to load
        except WebDriverException:
            print("没有找到'展开其余房型'按钮或按钮不可点击,继续处理现有房型")

        hotel_name = self.driver.find_element(By.CLASS_NAME, 'headInit_headInit-title_nameA__EE_LB').text
        address = self.driver.find_element(By.CLASS_NAME, 'headInit_headInit-address_text__D_Atv').text
        print(f"hotel_name:{hotel_name} address:{address}")

        room_elements = self.driver.find_elements(By.CLASS_NAME, "commonRoomCard__BpNjl")
        for i, room_element in enumerate(room_elements):
            try:
                # The first <span> inside the card holds the room name.
                room_name = room_element.find_element(By.TAG_NAME, "span").text
                print(f"房间 {i + 1}: {room_name}")
            except WebDriverException as e:
                print(f"获取房间 {i + 1} 信息时出错: {str(e)}")

    def run(self, city_name="", hotel_names=None):
        """Log in and crawl each requested hotel.

        Args:
            city_name: Destination for the search; leave "" when the hotel
                names are precise enough on their own.
            hotel_names: Iterable of hotel keywords. Defaults to the three
                Shanghai hotels of the original script.
        """
        if hotel_names is None:
            hotel_names = ["上海外滩豫园美居酒店", "上海松江开元名都大酒店", "上海徐家汇中心城际酒店"]

        self.driver.get('https://www.ctrip.com/')
        # Semi-automatic login; call self.manual_login() here instead to log
        # in completely by hand.
        self.voluntarily_login()
        # Even when logged in, the first search click shows the login page
        # again, so trigger and dismiss it once up front.
        self.pass_login()

        for hotel_name in hotel_names:
            self.crawl_hotel(city_name, hotel_name)
            # Return to the search form for the next hotel.
            self.driver.back()

        # Stop any page load still in progress; the browser window itself
        # stays open because of the "detach" option set in __init__.
        self.driver.execute_script('window.stop()')
if __name__ == '__main__':
    # Script entry point: constructing the crawler launches Chrome, and
    # run() performs the login and the crawl.
    SpiderTest1().run()
三、使用说明
1.先下载好与本机 Chrome 版本对应的 Google Chrome 浏览器驱动(chromedriver),并把代码中的 chromedriver_path 改成驱动所在的路径(不会的网上搜下,我这里就不放了)。
2.然后运行代码,程序会自动跳转到登录页面、输入手机号并发送验证码,接下来的步骤需要手动完成(下述操作默认需要在 30s 内完成,等待时间可自行设置):
1)过滑块
2)输入验证码后点击登录
3.登录成功后就不用动了,等待自动采集即可。
四、总结
获取价格的代码还没有写:博主的账号之前因为调用接口被封,现在看不到价格标签元素。有需要的可以联系我,按照我的指示把价格标签元素发给我,我再把代码补全。另外博主也想验证一下自动化采集会不会导致封号,所以先把代码放出来。
最后希望有人能找我~
3067

被折叠的 条评论
为什么被折叠?



