贝壳网的物业信息统计采集-房产经纪方向-CSDN博客

本文链接：https://blog.csdn.net/www_enjoy1_com/article/details/148381933
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Build the Chrome launch options (meant to speed up page loads), then start
# the browser session used by the rest of the script.
chrome_options = webdriver.ChromeOptions()
for _flag in (
    "--disable-gpu",  # turn off GPU acceleration (optional)
    "--no-sandbox",  # author's workaround for the "DevToolsActivePort file doesn't exist" error
    # Intended to skip image loading.
    # NOTE(review): this switch may not be recognised by current Chrome builds;
    # confirm it has any effect before relying on it.
    "--disable-images",
    "--disable-dev-shm-usage",  # author's workaround for resource limits on Linux
):
    chrome_options.add_argument(_flag)

# Start the WebDriver session with the options assembled above.
driver = webdriver.Chrome(options=chrome_options)

# File that receives the de-duplicated community links (one per line).
_HREF_FILE = "小区物业开发商数据href.txt"
# File that receives one line of extracted detail text per community (appended).
_DETAIL_FILE = "板块物业/板块开发商物业/小区物业开发商数据详情4.txt"
# Consecutive pagination failures tolerated before the crawl stops turning
# pages; without a bound a persistent error re-scrapes the same page forever.
_MAX_PAGE_TURN_FAILURES = 3


def _collect_page_hrefs(driver):
    """Return the hrefs found in the list on the currently loaded page.

    Waits up to 10 seconds for the ``data-component='list'`` container inside
    ``.content``, then reads the first ``<a>`` of every ``<li>`` under the
    ``listContent`` element. ``<li>`` items without a link, and links without
    an ``href`` attribute, are skipped instead of aborting the page.

    Raises whatever Selenium raises when the containers cannot be located.
    """
    content_container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".content [data-component='list']"))
    )
    list_content = content_container.find_element(By.CLASS_NAME, "listContent")

    hrefs = []
    for li in list_content.find_elements(By.TAG_NAME, "li"):
        # find_elements returns an empty list instead of raising, so a single
        # malformed <li> does not abort the page.
        anchors = li.find_elements(By.TAG_NAME, "a")
        if not anchors:
            continue
        href = anchors[0].get_attribute("href")
        if href:  # get_attribute returns None when the attribute is missing
            hrefs.append(href)
    return hrefs


def _collect_all_hrefs(driver):
    """Walk the paginated list and return every href collected (may contain duplicates).

    The loop ends when the last link in the pager is a page number (no "next"
    link left), when the pager has no links at all, when scraping a page
    fails, or after ``_MAX_PAGE_TURN_FAILURES`` consecutive pagination errors.
    """
    all_hrefs = []
    previous_data_page = None
    failures = 0

    while True:
        try:
            page_hrefs = _collect_page_hrefs(driver)
        except Exception as e:
            print("抓取数据时发生错误：", e)
            break
        all_hrefs.extend(page_hrefs)
        print(f"当前页面抓取到的链接数量：{len(page_hrefs)}")

        # Pagination.
        try:
            # Compound class names need a CSS selector rather than By.CLASS_NAME.
            page_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".page-box.fr"))
            )
            page_links = page_box.find_elements(By.TAG_NAME, "a")
            if not page_links:
                # A pager without links means there is nothing to turn to.
                print("最后一页已到达，循环结束。")
                break
            last_a_tag = page_links[-1]

            current_data_page = last_a_tag.get_attribute("data-page")

            # When the last pager link is a plain page number there is no
            # "next" link, i.e. this is the final page.
            if last_a_tag.text.isdigit():
                print("最后一页已到达，循环结束。")
                break

            # Same data-page as before the click: the previous page turn has
            # not taken effect yet. Wait for it, then re-scrape before
            # clicking again so the freshly loaded page is not skipped.
            if current_data_page == previous_data_page:
                print("等待页面加载，data-page 尚未变化...")
                WebDriverWait(driver, 20).until(
                    lambda d: last_a_tag.get_attribute("data-page") != previous_data_page
                )
                continue

            previous_data_page = current_data_page

            # Move the mouse onto the link and click it via ActionChains.
            actions = webdriver.ActionChains(driver)
            actions.move_to_element(last_a_tag).click().perform()
            print("翻页操作成功。")
            failures = 0

        except Exception as e:
            failures += 1
            print("翻页时发生错误：", e)
            if failures >= _MAX_PAGE_TURN_FAILURES:
                print("翻页连续失败次数过多，停止翻页。")
                break
            continue

    return all_hrefs


def _save_unique_hrefs(hrefs, path):
    """Write the de-duplicated hrefs to *path* (overwriting) and return them as a list.

    First-seen order is preserved so repeated runs produce a stable file.
    """
    unique_hrefs = list(dict.fromkeys(hrefs))
    with open(path, "w", encoding="utf-8") as file:
        file.writelines(f"{href}\n" for href in unique_hrefs)
    print(f"共抓取 {len(unique_hrefs)} 条唯一链接，已保存到 '{path}' 文件中。")
    return unique_hrefs


def _scrape_details(driver, links, output_path):
    """Visit each link and append its extracted detail line to *output_path*.

    After the first page is opened the user is asked to confirm; any answer
    other than ``y`` stops the loop. Errors on a single link are reported and
    the link is skipped.
    """
    # Create the target directory up front so a missing folder does not
    # discard the crawl at the very end.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "a", encoding="utf-8") as output_file:
        confirmed = False
        for link in links:
            link = link.strip()
            if not link:
                continue  # skip blank entries

            print(f"正在访问链接: {link}")
            driver.get(link)

            # Ask for confirmation once, on the first link actually visited.
            if not confirmed:
                user_input = input("请确认是否继续 (输入 y 继续): ")
                if user_input.strip().lower() != 'y':
                    print("用户终止程序")
                    break
                confirmed = True

            time.sleep(6)  # give the page time to load
            try:
                # Container holding the one-line info items.
                container = driver.find_element(By.CLASS_NAME, "xiaoquInfoItemOneLine")

                # All children carrying both classes "xiaoquInfoItem" and "outerItem".
                items = container.find_elements(By.CSS_SELECTOR, ".xiaoquInfoItem.outerItem")

                # Text of items 0, 2 and 3; an IndexError for a short list is
                # handled by the except below like any other per-link failure.
                text_0 = items[0].find_element(By.CSS_SELECTOR, ".xiaoquInfoContent.outer").text
                text_2 = items[2].find_element(By.CSS_SELECTOR, ".xiaoquInfoContent.outer").text
                text_3 = items[3].find_element(By.CSS_SELECTOR, ".xiaoquInfoContent.outer").text

                # The <h1> inside the "title-wrapper" container.
                title_wrapper = driver.find_element(By.CLASS_NAME, "title-wrapper")
                text_4 = title_wrapper.find_element(By.TAG_NAME, "h1").text

                result = f"{text_0}，{text_2}，{text_3}，{text_4}"
                print(f"提取的内容: {result}")

                output_file.write(result + "\n")
                output_file.flush()  # keep already-scraped lines if a later link crashes
            except Exception as e:
                print(f"处理链接 {link} 时发生错误：", e)
                continue


try:
    # Open the listing page.
    driver.get("https://sh.ke.com/xiaoqu/xietulu/")

    # Wait for the user to type "y" to confirm the page is reachable.
    user_input = input("请输入 'y' 确认访问成功：")
    if user_input.strip().lower() != 'y':
        print("用户未确认访问成功，程序退出。")
        # SystemExit is not an Exception subclass, so it skips the handler
        # below while still running the finally clause; unlike exit() it does
        # not depend on the site module.
        raise SystemExit

    all_hrefs = _collect_all_hrefs(driver)
    links = _save_unique_hrefs(all_hrefs, _HREF_FILE)
    _scrape_details(driver, links, _DETAIL_FILE)

except Exception as e:
    print("主程序运行时发生错误：", e)
finally:
    # Always close the browser.
    driver.quit()