东方财富网的各股信息基本面分析-python

最新推荐文章于 2025-06-17 00:28:39 发布
谢的2元王国
最新推荐文章于 2025-06-17 00:28:39 发布
阅读量84
点赞数 2
CC 4.0 BY-SA版权
文章标签： python 开发语言
本文链接：https://blog.youkuaiyun.com/www_enjoy1_com/article/details/148317064
import os
import re
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService  # 替换 EdgeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager  # 替换 EdgeChromiumDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver import ActionChains

# ==================== 初始化环境 ====================
def write_to_file(filepath, data):
    """Append ``data`` followed by a newline to the file at ``filepath``.

    The file is opened in append mode with UTF-8 encoding, so it is created
    when missing and earlier content is kept.

    Args:
        filepath: Path of the text file to append to.
        data: Text of the line to add (without the trailing newline).
    """
    with open(filepath, mode='a', encoding='utf-8') as handle:
        line = data + '\n'
        handle.write(line)

# Today's date as YYYY-MM-DD; it names the per-day output sub-directory.
current_date = datetime.now().date().isoformat()
# Output location: <parent directory>/<current date>, created on import
# (including the parent) when it does not exist yet.
output_dir = os.path.join("各股及时信息反映", current_date)
os.makedirs(output_dir, exist_ok=True)

# Chrome launch options used by initialize_driver() (Chrome replaces the
# earlier Edge-based setup).
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in ("--disable-gpu", "--no-sandbox", "--start-maximized"):
    chrome_options.add_argument(_chrome_flag)

def initialize_driver():
    """Create and return a Chrome WebDriver configured with ``chrome_options``.

    The driver binary path comes from ``ChromeDriverManager().install()`` and
    is handed to Chrome through a ``ChromeService``.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        SystemExit: With exit status 1 when installing the driver or starting
            the browser fails; the cause is printed first.
    """
    try:
        service = ChromeService(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        # Start-up boundary: any failure here is fatal for the script, so
        # report it and terminate.
        print(f"无法自动安装 ChromeDriver，请手动下载并指定路径：{e}")
        # Raise SystemExit directly instead of calling the bare ``exit``
        # helper: that name is injected by the ``site`` module for interactive
        # use and is missing under ``python -S`` or in frozen builds.
        raise SystemExit(1) from e

# Separators accepted between stock codes: Chinese/English commas and whitespace.
_CODE_SEPARATORS = re.compile(r'[，,\s]+')

# Characters that cannot appear in a file name on Windows (plus control chars).
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

# (container class, table id) of the three statements on the financial-analysis
# page, in the same order as the first three tabs of the "cwbbTab" strip.
_REPORT_TABLES = (
    ("zcfzb_table", "report_zcfzb"),
    ("lrb_table", "report_lrb"),
    ("xjllb_table", "report_xjllb"),
)


def _parse_stock_codes(raw_text):
    """Split user input into stock-code tokens, dropping empty ones.

    ``re.split`` yields ``['']`` for empty input and empty strings around
    leading/trailing separators, so the result is filtered.
    """
    return [token for token in _CODE_SEPARATORS.split(raw_text.strip()) if token]


def _normalize_stock_code(stock_code):
    """Return ``stock_code`` left-padded with zeros to six ASCII digits.

    Raises:
        ValueError: If the padded value is not exactly six digits.
    """
    code_str = str(stock_code).strip().zfill(6)
    if not re.fullmatch(r"[0-9]{6}", code_str):
        raise ValueError(f"无法识别的股票代码格式：{code_str}")
    return code_str


def _generate_stock_prefix(code_str):
    """Return the exchange-prefixed code (``sz…`` / ``sh…``) for a 6-digit code.

    Codes starting with 0 or 3 map to ``sz``; codes starting with 6 or 9 map
    to ``sh``. The 688 and 300 ranges are covered by these two rules.

    Raises:
        ValueError: For any other leading digit.
    """
    if code_str.startswith(("0", "3")):
        return f"sz{code_str}"
    if code_str.startswith(("6", "9")):
        return f"sh{code_str}"
    raise ValueError(f"无法识别的股票代码格式：{code_str}")


def _safe_filename(name):
    """Replace characters that are illegal in file names (e.g. ``*``) with ``_``."""
    return _INVALID_FILENAME_CHARS.sub("_", name)


def _open_quote_page(driver, code_str, stock_prefix):
    """Open the quote page, read the title and start the output file.

    Returns:
        Path of the output file, already holding the code and title lines.
    """
    url = f"https://quote.eastmoney.com/{stock_prefix}.html"
    print(f"正在访问股票代码 {code_str} 的页面: {url}")
    driver.get(url)

    # Compound class names are written as CSS selectors; this is the selector
    # Selenium derives from By.CLASS_NAME anyway, made explicit.
    title_container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".quote_title.self_clearfix"))
    )
    title_left = title_container.find_element(By.CLASS_NAME, "quote_title_l")
    title_text = title_left.find_element(By.TAG_NAME, "span").text.strip()
    print(f"抓取到的标题文本: {title_text}")

    # The title is scraped text, so sanitize it before using it in a file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"{_safe_filename(title_text)}_{timestamp}.txt"
    output_filepath = os.path.join(output_dir, output_filename)

    write_to_file(output_filepath, f"股票代码: {code_str}")
    write_to_file(output_filepath, f"标题文本: {title_text}")
    return output_filepath


def _click_zoom_button(driver):
    """Best-effort click on the ``tk_tg_zoomin()`` image of the quote page."""
    try:
        zoom_img = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "img[onclick='tk_tg_zoomin()']"))
        )
        zoom_img.click()
        print("已点击缩放按钮 (tk_tg_zoomin)。")
    except Exception as e:
        # Deliberately broad: a missing or unclickable button must not abort
        # the scrape of the current stock.
        print(f"未能找到或点击缩放按钮: {e}")


def _scrape_brief_table(driver, output_filepath):
    """Print and save every row of the ``sider_brief`` table on the quote page.

    Raises:
        RuntimeError: If an expected container is missing.
    """
    container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".quote2l.mt10"))
    )

    # The first match is used at each level.
    cr2_containers = container.find_elements(By.CLASS_NAME, "quote2l_cr2")
    if not cr2_containers:
        raise RuntimeError("未找到足够的 quote2l_cr2 容器")

    cr2_c_containers = cr2_containers[0].find_elements(By.CSS_SELECTOR, ".quote2l_cr2_c.mt10")
    if not cr2_c_containers:
        raise RuntimeError("未找到足够的 quote2l_cr2_c.mt10 容器")

    sider_brief = cr2_c_containers[0].find_element(By.CLASS_NAME, "sider_brief")
    for row in sider_brief.find_elements(By.TAG_NAME, "tr"):
        cell_contents = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
        print(cell_contents)
        write_to_file(output_filepath, ', '.join(cell_contents))


def _scrape_financial_reports(driver, stock_prefix, output_filepath):
    """Open the financial-analysis page and save the tables in ``_REPORT_TABLES``.

    Raises:
        RuntimeError: If the tab strip has fewer tabs than tables to read.
    """
    cwfx_url = f"https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code={stock_prefix}&color=b#/cwfx"
    driver.get(cwfx_url)
    print("已访问财务分析页面:", cwfx_url)

    wait = WebDriverWait(driver, 10)
    # The "menus" container appearing is treated as "page loaded".
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "menus")))
    section = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".section.cwbb"))
    )
    tab_items = section.find_element(By.CLASS_NAME, "cwbbTab").find_elements(By.TAG_NAME, "li")
    if len(tab_items) < len(_REPORT_TABLES):
        raise RuntimeError("未找到足够的 li 标签")

    # zip() stops after the configured tables, i.e. only the first three tabs.
    for index, (tab_item, (container_class, table_id)) in enumerate(
            zip(tab_items, _REPORT_TABLES), start=1):
        ActionChains(driver).move_to_element(tab_item).click().perform()
        print(f"已模拟鼠标点击第 {index} 个标签，等待 1 秒...")
        time.sleep(1)  # give the tab content time to render

        try:
            table_container = section.find_element(By.CLASS_NAME, container_class)
            report_table = table_container.find_element(By.ID, table_id)
        except NoSuchElementException:
            print(f"未找到表格 (容器: {container_class}, 表格 ID: {table_id})")
            continue

        header_cells = (th.text.strip() for th in report_table.find_elements(By.TAG_NAME, "th"))
        headers = [text for text in header_cells if text]
        print(f"\n表头 ({container_class}, 第 {index} 个标签):", headers)
        write_to_file(output_filepath, f"表头 ({container_class}, 第 {index} 个标签): {', '.join(headers)}")

        for row in report_table.find_elements(By.TAG_NAME, "tr"):
            cell_texts = (td.text.strip() for td in row.find_elements(By.TAG_NAME, "td"))
            cells = [text for text in cell_texts if text]
            if cells:  # skip rows without any non-empty cell
                print(f"行数据 ({container_class}, 第 {index} 个标签):", cells)
                write_to_file(output_filepath, ', '.join(cells))


def _scrape_fund_flow(driver, code_str, output_filepath):
    """Open the fund-flow page and print/save the header and first 7 rows.

    The table is taken from the third ``.content`` container inside
    ``.main-content .framecontent``; if there are fewer than three, a notice
    is printed and nothing is saved.
    """
    # Use the zero-padded code so this URL agrees with the other two pages.
    url = f"https://data.eastmoney.com/zjlx/{code_str}.html"
    print(f"正在访问股票代码 {code_str} 的页面: {url}")
    driver.get(url)

    main_content = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "main-content"))
    )
    frame_content = main_content.find_element(By.CLASS_NAME, "framecontent")
    contents = frame_content.find_elements(By.CLASS_NAME, "content")
    if len(contents) < 3:
        print(".content 容器不足三个")
        return

    table = contents[2].find_element(By.TAG_NAME, "table")
    thead = table.find_element(By.TAG_NAME, "thead")
    headers = [th.text for th in thead.find_elements(By.TAG_NAME, "th")]
    print("Table Headers: ", headers)
    # Saved as well, so the output file covers all three pages.
    write_to_file(output_filepath, f"资金流向表头: {', '.join(headers)}")

    rows = table.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
    for row_number, row in enumerate(rows[:7], start=1):  # first 7 rows only
        cells = [td.text for td in row.find_elements(By.TAG_NAME, "td")]
        print(f"Row {row_number} Data: {cells}")
        write_to_file(output_filepath, ', '.join(cells))


def main():
    """Interactively scrape Eastmoney pages for user-supplied stock codes.

    Repeatedly prompts for stock codes (separated by Chinese/English commas or
    whitespace). For each code it visits the quote page, the financial-analysis
    page and the fund-flow page, printing the scraped data and appending it to
    a per-stock text file under ``output_dir``. A failure on one code is
    reported and the next code is processed. The loop ends when the user
    answers anything other than ``y``; the browser is always closed on exit.
    """
    driver = initialize_driver()
    try:
        while True:
            raw_text = input("请输入多个股票代码（支持中/英文逗号或空格分隔）：")
            stock_codes = _parse_stock_codes(raw_text)
            if not stock_codes:
                print("未输入有效的股票代码！")
                continue

            # The zoom button is clicked at most once per batch of codes.
            first_visit = True

            for stock_code in stock_codes:
                try:
                    code_str = _normalize_stock_code(stock_code)
                    stock_prefix = _generate_stock_prefix(code_str)

                    output_filepath = _open_quote_page(driver, code_str, stock_prefix)
                    if first_visit:
                        # Cleared before the attempt: one try only, successful or not.
                        first_visit = False
                        _click_zoom_button(driver)
                    _scrape_brief_table(driver, output_filepath)

                    _scrape_financial_reports(driver, stock_prefix, output_filepath)
                    _scrape_fund_flow(driver, code_str, output_filepath)
                except Exception as e:
                    # Per-stock boundary: report and move on to the next code.
                    print(f"股票代码 {stock_code} 处理失败: {e}")

            user_input = input("是否继续处理其他股票代码？(y/n): ").strip().lower()
            if user_input != 'y':
                break
    finally:
        # Close the browser even on Ctrl+C or an unexpected error.
        driver.quit()

# Run the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()