import subprocess
import requests
import pandas as pd
import time
import os
import json
import numpy as np
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# 配置参数
START_TIME = '2025-09-01 09:31' # 设置起始时间
BASE_DIR = r"E:\中国金融"
STOCK_LIST_FILE = os.path.join(BASE_DIR, "股票列表2.txt")
MAX_THREADS = 32
MAX_RETRIES = 5
RETRY_DELAY = 3
# 数据周期设置
PERIOD_CHOICE = '1'
# 数据周期映射
PERIOD_MAP = {
'1': {'candle_period': 1, 'period_name': '1分钟', 'data_count': 240, 'freq': '1min'},
'2': {'candle_period': 5, 'period_name': '5分钟', 'data_count': 240, 'freq': '5min'},
'3': {'candle_period': 15, 'period_name': '15分钟', 'data_count': 240, 'freq': '15min'},
'4': {'candle_period': 30, 'period_name': '30分钟', 'data_count': 240, 'freq': '30min'},
'5': {'candle_period': 60, 'period_name': '60分钟', 'data_count': 240, 'freq': '1H'},
'6': {'candle_period': 1440, 'period_name': '日线', 'data_count': 365, 'freq': '1D'}
}
# 19个指数代码
INDEX_CODES = [
'000016.SS', # 上证50
'399300.SZ', # 沪深300
'000906.SS', # 中证800
'000852.SS', # 中证1000
'000985.SS', # 中证全指
'932078.CSI', # 行业指数1
'932079.CSI', # 行业指数2
'932080.CSI', # 行业指数3
'932081.CSI', # 行业指数4
'932082.CSI', # 行业指数5
'932083.CSI', # 行业指数6
'932084.CSI', # 行业指数7
'932085.CSI', # 行业指数8
'932086.CSI', # 行业指数9
'932077.CSI', # 行业指数10
'932000.CSI', # 中证2000
'399903.SZ', # 中证100
'000904.SS', # 中证200
'000905.SS' # 中证500
]
# 根据选择获取周期配置
PERIOD_CONFIG = PERIOD_MAP.get(PERIOD_CHOICE, PERIOD_MAP['1'])
# 设置输出文件名
OUTPUT_FILENAME = os.path.join(BASE_DIR, f"合并股票数据_{PERIOD_CONFIG['period_name']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
def get_stock_list(file_path):
"""从文本文件读取股票代码列表并统一格式"""
try:
if not os.path.exists(file_path):
print(f"错误: 文件不存在 - {file_path}")
return []
with open(file_path, 'r', encoding='utf-8') as f:
stock_codes = []
for line in f:
line = line.strip()
if not line:
continue
# 处理不同的代码格式
if '.' in line:
code, suffix = line.split('.')
suffix = suffix.upper()
if suffix in ['SS', 'SH']:
stock_codes.append(f"{code}.SH")
elif suffix == 'SZ':
stock_codes.append(f"{code}.SZ")
else:
stock_codes.append(f"{code}.{suffix}")
else:
# 无后缀,根据代码开头判断
if line.startswith('6'):
stock_codes.append(f"{line}.SH")
else:
stock_codes.append(f"{line}.SZ")
return stock_codes
except Exception as e:
print(f"读取股票列表文件失败: {e}")
return []
def create_time_index(start_time, freq, periods=240):
"""创建时间索引,确保所有股票使用相同的时间点"""
start_dt = pd.to_datetime(start_time)
if freq == '1D': # 日线数据
return pd.date_range(start=start_dt, periods=periods, freq=freq)
else: # 分钟数据
# 确保开始时间是交易时间 (9:30-11:30, 13:00-15:00)
if start_dt.time() < pd.Timestamp('09:30:00').time():
start_dt = start_dt.replace(hour=9, minute=30)
elif start_dt.time() > pd.Timestamp('15:00:00').time():
start_dt = start_dt + timedelta(days=1)
start_dt = start_dt.replace(hour=9, minute=30)
elif pd.Timestamp('11:30:00').time() < start_dt.time() < pd.Timestamp('13:00:00').time():
start_dt = start_dt.replace(hour=13, minute=0)
return pd.date_range(start=start_dt, periods=periods, freq=freq)
def process_stock(stock_code, time_index):
"""处理单个股票/指数的数据,带重试机制"""
for attempt in range(MAX_RETRIES):
try:
# 确定candle_mode参数
suffix = stock_code.split('.')[1].upper() if '.' in stock_code else 'SZ'
candle_mode = 0 if suffix in ['SS', 'SH', 'CSI', 'XSHG'] else 1
# 构建请求URL
timestamp = int(time.time() * 1000)
url = f"https://quotedata.cnfin.com/quote/v1/kline?localDate={timestamp}&get_type=offset&prod_code={stock_code}&candle_period={PERIOD_CONFIG['candle_period']}&candle_mode={candle_mode}&fields=open_px,high_px,low_px,close_px,business_amount,business_balance,turnover_ratio,preclose_px,special_close_px,settlement,amount&data_count={PERIOD_CONFIG['data_count']}"
# 添加请求头
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': 'https://quote.cnfin.com/',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
# 尝试多种解析方式
try:
raw_data = response.json()
except:
# 如果JSON解析失败,尝试直接解析文本
content = response.text.strip()
if content.startswith('(') and content.endswith(')'):
content = content[1:-1] # 去除可能的JSONP包装
raw_data = json.loads(content)
# 检查数据有效性
if not raw_data or 'data' not in raw_data:
raise ValueError(f"无效响应: 缺少data字段")
data = raw_data['data']
if not data or 'candle' not in data:
raise ValueError(f"无效响应: 缺少candle字段")
candle_data = data['candle']
if not candle_data or stock_code not in candle_data:
# 尝试使用不带后缀的代码
base_code = stock_code.split('.')[0]
if base_code in candle_data:
stock_data = candle_data[base_code]
else:
raise ValueError(f"股票代码 {stock_code} 不在响应数据中")
else:
stock_data = candle_data[stock_code]
if not stock_data:
raise ValueError(f"空数据: {stock_code}")
# 获取字段列表
fields = candle_data.get('fields', ['min_time', 'open_px', 'high_px', 'low_px', 'close_px', 'business_amount', 'business_balance'])
# 创建DataFrame
df = pd.DataFrame(stock_data, columns=fields)
if df.empty:
raise ValueError(f"空DataFrame: {stock_code}")
# 处理时间字段
if 'min_time' in df.columns:
df['datetime'] = pd.to_datetime(df['min_time'], format='%Y%m%d%H%M')
elif 'date' in df.columns:
df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
else:
# 如果没有时间字段,尝试使用第一列
df['datetime'] = pd.to_datetime(df.iloc[:, 0], format='%Y%m%d%H%M', errors='coerce')
if df['datetime'].isna().all():
df['datetime'] = pd.to_datetime(df.iloc[:, 0], format='%Y%m%d', errors='coerce')
# 确保有close_px列
if 'close_px' not in df.columns:
if len(df.columns) >= 5: # 假设第5列是收盘价
df['close_px'] = df.iloc[:, 4]
else:
df['close_px'] = 1
print(f"警告: {stock_code} 无法确定收盘价列,使用默认值1")
# 处理缺失值
df['close_px'] = pd.to_numeric(df['close_px'], errors='coerce').fillna(1)
# 设置时间为索引
df = df.set_index('datetime')
# 重新索引以匹配统一的时间轴
df = df.reindex(time_index, method='ffill')
# 重置索引以便后续处理
df = df.reset_index().rename(columns={'index': 'datetime'})
# 确保数据从START_TIME开始
start_dt = pd.to_datetime(START_TIME)
df = df[df['datetime'] >= start_dt].copy()
if df.empty:
# 如果没有符合时间条件的数据,创建默认数据
df = pd.DataFrame({
'datetime': time_index,
'close_px': 1
})
print(f"警告: {stock_code} 无符合时间条件的数据,使用默认值1")
return stock_code.replace('.', '_'), df[['datetime', 'close_px']]
except Exception as e:
if attempt < MAX_RETRIES - 1:
print(f"处理 {stock_code} 失败 (尝试 {attempt+1}/{MAX_RETRIES}): {str(e)},{RETRY_DELAY*(attempt+1)}秒后重试...")
time.sleep(RETRY_DELAY * (attempt + 1))
else:
print(f"处理 {stock_code} 失败 (所有尝试已用完): {str(e)}")
# 返回一个默认数据框
df = pd.DataFrame({
'datetime': time_index,
'close_px': 1
})
return stock_code.replace('.', '_'), df
def open_with_wps(file_path):
"""使用WPS Office打开指定文件"""
try:
if not os.path.exists(file_path):
print(f"错误: 文件不存在 - {file_path}")
return False
# 尝试多种WPS路径
wps_paths = [
r"C:\Program Files\WPS Office\11.1.0.12680\office6\et.exe",
r"C:\Program Files (x86)\WPS Office\11.1.0.12680\office6\et.exe",
r"C:\Program Files\WPS Office\11.1.0.12827\office6\et.exe",
r"C:\Program Files (x86)\WPS Office\11.1.0.12827\office6\et.exe",
r"C:\Program Files\WPS Office\office6\et.exe",
r"C:\Program Files (x86)\WPS Office\office6\et.exe",
]
wps_executable = None
for path in wps_paths:
if os.path.exists(path):
wps_executable = path
break
if wps_executable:
subprocess.Popen([wps_executable, file_path])
print(f"已使用WPS Office打开文件: {file_path}")
else:
print("警告: 找不到WPS Office可执行文件,尝试使用系统默认应用打开...")
os.startfile(file_path)
return True
except Exception as e:
print(f"打开文件时出错: {str(e)}")
print("尝试使用系统默认应用打开...")
try:
os.startfile(file_path)
return True
except Exception as e2:
print(f"使用系统默认应用打开也失败: {str(e2)}")
return False
def main():
print(f"数据周期: {PERIOD_CONFIG['period_name']}")
# 确保目录存在
if not os.path.exists(BASE_DIR):
try:
os.makedirs(BASE_DIR)
print(f"创建目录: {BASE_DIR}")
except Exception as e:
print(f"无法创建目录: {BASE_DIR}, 错误: {e}")
return
# 读取股票列表
stock_codes = get_stock_list(STOCK_LIST_FILE)
if not stock_codes:
print("没有找到可用的股票代码,请检查股票列表文件")
return
# 将19个指数代码添加到股票列表前面
all_codes = INDEX_CODES + stock_codes
# 创建统一的时间索引
time_index = create_time_index(START_TIME, PERIOD_CONFIG['freq'], PERIOD_CONFIG['data_count'])
print(f"开始处理 {len(all_codes)} 个股票/指数代码")
print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"最大线程数: {MAX_THREADS}")
print(f"数据起始时间: {START_TIME}")
print(f"数据目录: {BASE_DIR}")
print(f"输出文件: {OUTPUT_FILENAME}")
# 使用线程池处理
results = {}
failed_codes = []
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
# 传递时间索引给每个处理函数
future_to_code = {executor.submit(process_stock, code, time_index): code for code in all_codes}
with tqdm(total=len(all_codes), desc="处理进度") as pbar:
for future in as_completed(future_to_code):
stock_code = future_to_code[future]
col_name, stock_df = future.result()
if stock_df is not None:
results[col_name] = stock_df
else:
failed_codes.append(stock_code)
pbar.update(1)
# 创建包含所有时间点的DataFrame
if results:
print("\n正在合并数据...")
# 使用统一的时间索引作为基础
base_df = pd.DataFrame({'datetime': time_index})
# 合并所有股票数据
for code in all_codes:
col_name = code.replace('.', '_')
if col_name in results:
temp_df = results[col_name][['datetime', 'close_px']].copy()
temp_df = temp_df.rename(columns={'close_px': col_name})
base_df = pd.merge(base_df, temp_df, on='datetime', how='left')
else:
base_df[col_name] = 1
print(f"为 {code} 添加全1列")
# 填充缺失值
base_df = base_df.fillna(1)
# 确保列顺序正确
col_names = ['datetime'] + [code.replace('.', '_') for code in all_codes]
base_df = base_df[col_names]
# 确保数据从START_TIME开始
start_dt = pd.to_datetime(START_TIME)
base_df = base_df[base_df['datetime'] >= start_dt].copy()
# 写入文件
print(f"正在写入文件: {OUTPUT_FILENAME}")
base_df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')
print(f"\n成功保存数据到: {OUTPUT_FILENAME}")
print(f"成功处理股票/指数数: {len(results)}/{len(all_codes)}")
if failed_codes:
print(f"失败代码列表 ({len(failed_codes)}个):")
for i, code in enumerate(failed_codes):
print(f"{i+1}. {code}")
# 使用WPS Office打开生成的CSV文件
print("\n尝试使用WPS Office打开生成的CSV文件...")
open_with_wps(OUTPUT_FILENAME)
else:
print("\n没有有效数据可保存")
print(f"结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("处理完成!")
if __name__ == "__main__":
main() 脚本运行后出现大量与实际不一致的重复数据,请修改。