selenium自动化采集携程酒店数据

一、前言

        通过自动化页面点击操作来实现采集携程酒店数据。

二、主代码

import random
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class SpiderTest1():
    """Selenium-driven crawler that searches ctrip.com for hotels and lists their rooms.

    The crawler launches Chrome, logs in (SMS-code flow with manual captcha /
    code entry, or a fully manual login), searches for each requested hotel,
    opens every search result in its own tab and collects the hotel name,
    address and room names from the detail page.
    """

    # XPath of the search button clicked by pass_login() and crawl_hotel().
    SEARCH_BUTTON_XPATH = '//*[@id="kakxi"]/li[6]/div/span'
    # Hotels crawled by run() when the caller does not supply a list.
    DEFAULT_HOTEL_NAMES = ("上海外滩豫园美居酒店", "上海松江开元名都大酒店", "上海徐家汇中心城际酒店")
    # Default number of seconds the explicit waits allow for an element to appear.
    ELEMENT_TIMEOUT = 10

    def __init__(self, phone="自己的手机号", chromedriver_path='用自己的浏览器驱动路径'):
        """Start Chrome with automation-hiding options.

        Args:
            phone: Phone number typed into the SMS-code login form.
            chromedriver_path: Filesystem path of the chromedriver executable.
        """
        self.phone = phone
        chrome_options = Options()
        # Common options that lower the chance of being detected as automation.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        # "detach" is set so the browser is meant to stay open after the script ends.
        chrome_options.add_experimental_option("detach", True)
        # If the options above are not enough, disabling JavaScript can be
        # tried, but it may break page functionality:
        # chrome_options.add_argument("--disable-javascript")

        service = Service(executable_path=chromedriver_path)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.action = ActionChains(self.driver)

        # Hide navigator.webdriver on the document that is loaded right now.
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        # Register the same override through CDP so it is evaluated on every
        # new document as well.
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined
                        })
                    '''
        })
        # Handle of the window that shows the search page; set by crawl_hotel().
        self.main_window = None

    @staticmethod
    def _find_new_window(known_handles, current_handles):
        """Return the first handle in current_handles that is not in known_handles.

        Returns None when no new handle exists.
        """
        known = set(known_handles)
        return next((handle for handle in current_handles if handle not in known), None)

    def _wait_clickable(self, by, value, timeout=None):
        """Wait until the element located by (by, value) is clickable and return it.

        Raises:
            TimeoutException: If the element is not clickable within `timeout`
                seconds (ELEMENT_TIMEOUT when `timeout` is None).
        """
        if timeout is None:
            timeout = self.ELEMENT_TIMEOUT
        return WebDriverWait(self.driver, timeout).until(
            EC.element_to_be_clickable((by, value))
        )

    def _close_extra_windows(self, keep_handles):
        """Close every window not listed in keep_handles, then refocus the main window."""
        keep = set(keep_handles)
        for handle in self.driver.window_handles:
            if handle in keep:
                continue
            try:
                self.driver.switch_to.window(handle)
                self.driver.close()
            except WebDriverException:
                # Best effort: the window may already have been closed.
                pass
        if self.main_window in self.driver.window_handles:
            self.driver.switch_to.window(self.main_window)

    def manual_login(self, wait_seconds=60):
        """Pause for `wait_seconds` seconds so the user can log in by hand."""
        print(f"你有 {wait_seconds}s 时间进行登录,请手动操作页面进行登录")
        time.sleep(wait_seconds)

    def voluntarily_login(self, wait_seconds=30):
        """Drive the SMS-code login form, then pause for the manual steps.

        The method opens the login page, switches to verification-code login,
        types the phone number, ticks the agreement box and requests the code.
        It then sleeps for `wait_seconds` seconds so the user can solve the
        slider captcha and type the received code.

        Raises:
            TimeoutException: If one of the form elements does not become
                clickable in time.
        """
        # Open the login page.
        self._wait_clickable(By.CLASS_NAME, 'tl_nfes_home_header_login_title_5neWJ').click()
        # Switch to verification-code login.
        self._wait_clickable(By.CLASS_NAME, 'login-entry-dynamic').click()
        # Type the phone number.
        input_phone = self._wait_clickable(
            By.XPATH,
            '//*[@id="bbz_accounts_pc_lg_box"]/div/div/div[1]/div[2]/form/dl[2]/dd/dl/dd/input',
        )
        input_phone.send_keys(self.phone)
        # Tick the agreement checkbox.
        self._wait_clickable(
            By.XPATH,
            '//*[@id="bbz_accounts_pc_lg_box"]/div/div/div[1]/div[4]/div/div[1]/label',
        ).click()
        # Request the verification code.
        self._wait_clickable(By.CLASS_NAME, 'btn-primary-s').click()

        # Leave time for the slider captcha and the verification code.
        print(f"你有 {wait_seconds}s 时间输入验证码,请手动操作页面进行登录")
        time.sleep(wait_seconds)

    def pass_login(self):
        """Click search and then the home-page link to get past the login pop-up.

        Raises:
            TimeoutException: If either element does not become clickable in time.
        """
        # Trigger a search (this is what makes the login pop-up appear).
        self._wait_clickable(By.XPATH, self.SEARCH_BUTTON_XPATH).click()

        # Go back to the home page.
        time.sleep(1)
        self._wait_clickable(By.XPATH, '//*[@id="hp_nfes_homepage"]/span').click()
        time.sleep(2)

    def crawl_hotel(self, city_name, hotel_name):
        """Search for a hotel and crawl every entry of the result list.

        Args:
            city_name: Destination typed into the city box (may be empty).
            hotel_name: Keyword typed into the hotel keyword box.

        Returns:
            A list with one crawl_room() result per hotel that was crawled
            successfully; empty when the search produced no result list.
        """
        self.main_window = self.driver.current_window_handle
        # Destination city: typed one character at a time with random pauses.
        city_box = self._wait_clickable(By.ID, 'hotels-destination')
        city_box.click()
        time.sleep(1)
        city_box.clear()
        for char in city_name:
            city_box.send_keys(char)
            time.sleep(random.uniform(0.1, 0.3))
        time.sleep(1)
        city_box.send_keys(" ")
        time.sleep(2)
        self.driver.find_element(By.ID, 'hotels-destination').send_keys(Keys.TAB)

        # Hotel keyword.
        hotel_box = self.driver.find_element(By.ID, 'keyword')
        hotel_box.clear()
        hotel_box.send_keys(hotel_name)
        time.sleep(1)

        # Run the search.
        self.driver.find_element(By.XPATH, self.SEARCH_BUTTON_XPATH).click()

        # Wait for the result list instead of reading it immediately: right
        # after the click the list is usually still empty.
        try:
            hotel_elements = WebDriverWait(self.driver, self.ELEMENT_TIMEOUT).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "list-item"))
            )
        except TimeoutException:
            print(f"搜索 {hotel_name} 没有返回任何酒店,跳过")
            return []

        results = []
        for i, hotel in enumerate(hotel_elements):
            print(f"\n正在处理第 {i + 1} 个酒店...")

            # Snapshot the open windows so the tab opened by the click can be
            # identified reliably and cleaned up even when crawling fails.
            handles_before = self.driver.window_handles
            try:
                # Clicking a result opens the detail page in a new tab.
                hotel.click()
                time.sleep(2)
                new_window = self._find_new_window(handles_before, self.driver.window_handles)
                if new_window is None:
                    raise RuntimeError("no new window was opened for this hotel")
                self.driver.switch_to.window(new_window)
                # Give the detail page a moment to finish loading.
                time.sleep(2)
                results.append(self.crawl_room())
            except Exception as e:
                # Per-hotel boundary: report the failure and move on to the next one.
                print(f"处理第 {i + 1} 个酒店时出错: {str(e)}")
            finally:
                # Close the detail tab (if any) and return to the result list.
                self._close_extra_windows(handles_before)
                # Short pause before handling the next hotel.
                time.sleep(1)
        return results

    def crawl_room(self):
        """Collect hotel name, address and room names from the current detail page.

        Returns:
            A dict with the keys "hotel_name", "address" and "rooms" (list of
            the room names that could be read).

        Raises:
            WebDriverException: If the hotel name or address cannot be located.
        """
        # Expand the folded room types when the button exists; otherwise carry on.
        try:
            rest_room = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'mainRoomList-foldButton_text__yb3An'))
            )
            rest_room.click()
            print("已点击展开其余房型")
            time.sleep(2)  # let the extra room types load
        except WebDriverException:
            print("没有找到'展开其余房型'按钮或按钮不可点击,继续处理现有房型")

        # Wait for the title so a slow page load does not fail the whole hotel.
        hotel_name = WebDriverWait(self.driver, self.ELEMENT_TIMEOUT).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'headInit_headInit-title_nameA__EE_LB'))
        ).text
        address = self.driver.find_element(By.CLASS_NAME, 'headInit_headInit-address_text__D_Atv').text

        print(f"hotel_name:{hotel_name} address:{address}", )
        rooms = []
        room_elements = self.driver.find_elements(By.CLASS_NAME, "commonRoomCard__BpNjl")
        for i, room_element in enumerate(room_elements):
            try:
                # The room name is the text of the first <span> inside the card.
                room_name = room_element.find_element(By.TAG_NAME, "span").text
                print(f"房间 {i + 1}: {room_name}")
                rooms.append(room_name)
            except WebDriverException as e:
                print(f"获取房间 {i + 1} 信息时出错: {str(e)}")
        return {"hotel_name": hotel_name, "address": address, "rooms": rooms}

    def run(self, city_name="", hotel_names=None, use_manual_login=False):
        """Log in and crawl every requested hotel.

        Args:
            city_name: Destination city; leave empty to search by hotel name only.
            hotel_names: Iterable of hotel keywords; DEFAULT_HOTEL_NAMES when None.
            use_manual_login: When True, wait for a fully manual login instead
                of driving the SMS-code form.

        Returns:
            A list of crawl_room() results for all hotels that were crawled.
        """
        if hotel_names is None:
            hotel_names = self.DEFAULT_HOTEL_NAMES

        self.driver.get('https://www.ctrip.com/')

        if use_manual_login:
            self.manual_login()
        else:
            self.voluntarily_login()
        # Even when logged in, the first search brings up the login pop-up
        # again, so it has to be dismissed once.
        self.pass_login()

        results = []
        for hotel_name in hotel_names:
            results.extend(self.crawl_hotel(city_name, hotel_name))
            # Go back so the next search starts from the previous page.
            self.driver.back()
        # Stop any further page loading; the browser itself is left open.
        self.driver.execute_script('window.stop()')
        return results



if __name__ == '__main__':
    # Script entry point: build the crawler (this launches Chrome) and run
    # the complete login-and-crawl flow.
    spider = SpiderTest1()
    spider.run()

三、使用说明

1.先下载好与本机 Chrome 浏览器版本对应的 Google Chrome 浏览器驱动(chromedriver)(不会的可以在网上搜一下,我这里就不放了)。

2.然后运行代码,程序会自动跳转到登录页面、输入手机号并发送验证码,接下来的步骤需要手动完成(默认需要在 30s 内完成,等待时长可自行设置):
1)过滑块
2)输入验证码后点击登录

3.登录成功后就不用动了,等待自动采集即可。

四、总结

获取价格的代码还没有写:博主的账号之前因为调用接口被封,现在看不到价格标签元素。有需要的可以联系我,按照我的指示把价格标签元素发给我,我来补全代码。另外,博主也很想检测一下自动化采集会不会导致封号,所以先把代码放出来。

最后希望有人能找我~

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值