import os
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


class StockDataAnalysisSystem:
    def __init__(self):
        # Configured paths (machine-specific Windows paths; adjust before running elsewhere)
        self.stock_data_path = r"F:\baostock_股票文件夹"
        self.industry_file = r"F:\TDX股票信息\全部A股基本信息.csv"
        self.output_path = r"F:\股票分析结果"
        # Create the output directories
        os.makedirs(self.output_path, exist_ok=True)
        os.makedirs(os.path.join(self.output_path, "周期涨幅"), exist_ok=True)
        os.makedirs(os.path.join(self.output_path, "周期涨幅排名"), exist_ok=True)
        os.makedirs(os.path.join(self.output_path, "每日统计"), exist_ok=True)
        os.makedirs(os.path.join(self.output_path, "行业统计"), exist_ok=True)
        # Return periods, in trading days
        self.periods = [1, 3, 5, 10, 15, 20, 25, 30, 50, 90, 120, 250]
        # In-memory storage
        self.all_returns_data = {}  # per-stock period-return DataFrames, keyed by code
        self.industry_data = None   # industry classification data

    def step1_traverse_and_calculate(self):
        """Step 1: walk the data folder, read every CSV and compute multi-period returns per stock."""
        print("=" * 50)
        print("Step 1: traverse folder and compute period returns")
        print("=" * 50)
        if not os.path.exists(self.stock_data_path):
            print(f"Stock data directory does not exist: {self.stock_data_path}")
            return 0
        csv_files = [f for f in os.listdir(self.stock_data_path) if f.endswith('.csv')]
        print(f"Found {len(csv_files)} CSV files")
        success_count = 0
        for csv_file in csv_files:
            try:
                code = csv_file.replace('.csv', '').zfill(6)
                file_path = os.path.join(self.stock_data_path, csv_file)
                # Try several encodings when reading
                df = self.read_csv_with_encoding(file_path)
                if df is None or len(df) < 10:
                    continue
                # Compute the period returns
                returns_df = self.calculate_period_returns(df, code)
                if returns_df is not None:
                    self.all_returns_data[code] = returns_df
                    success_count += 1
                    # Save this stock's period returns
                    output_file = os.path.join(self.output_path, "周期涨幅", f"{code}.csv")
                    returns_df.to_csv(output_file, index=False)
                    if success_count % 100 == 0:
                        print(f"Processed {success_count} files...")
            except Exception as e:
                print(f"Failed to process {csv_file}: {e}")
                continue
        print(f"Step 1 finished, {success_count} files processed successfully")
        return success_count

    def read_csv_with_encoding(self, file_path):
        """Read a CSV file, trying several encodings in turn."""
        encodings = ['gbk', 'utf-8', 'gb2312', 'gb18030']  # 'ansi' is not a Python codec name, so it is dropped
        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding)
                # Normalize column names
                df.columns = df.columns.str.strip().str.lower()
                # Map to standard column names; match carefully so that e.g. 'open' is not
                # mistaken for a PE column and 'preclose' is not mistaken for the close
                column_mapping = {}
                for col in df.columns:
                    if 'date' in col or '日期' in col:
                        column_mapping[col] = 'date'
                    elif ('close' in col or '收盘' in col) and 'pre' not in col and '前' not in col and '昨' not in col:
                        column_mapping[col] = 'close'
                    elif 'turn' in col or '换手' in col:
                        column_mapping[col] = 'turn'
                    elif col == 'pe' or 'pettm' in col or '市盈率' in col:
                        column_mapping[col] = 'peTTM'
                    elif 'pb' in col or '市净率' in col:
                        column_mapping[col] = 'pBMRQ'
                df = df.rename(columns=column_mapping)
                # Required columns must be present; otherwise try the next encoding
                if 'date' not in df.columns or 'close' not in df.columns:
                    continue
                # Clean the data
                df['date'] = pd.to_datetime(df['date'])
                df['close'] = pd.to_numeric(df['close'], errors='coerce')
                df = df.sort_values('date').dropna(subset=['close'])
                return df
            except UnicodeDecodeError:
                continue
            except Exception as e:
                print(f"Failed to read {file_path}: {e}")
                continue
        return None
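
    # Assumption: CSVs exported from baostock daily bars typically carry columns such as
    # date, code, open, high, low, close, preclose, volume, amount, turn, peTTM and pbMRQ;
    # the mapping above matches by substring because header spelling can vary between exports.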

    def calculate_period_returns(self, df, code):
        """Compute multi-period percentage returns for one stock."""
        try:
            df = df.sort_values('date')
            # Percentage return over each look-back period
            for period in self.periods:
                df[f'ZF{period}'] = df['close'].pct_change(period) * 100
            # Attach the stock code
            df['code'] = code
            # Keep only the columns we need
            columns = ['date', 'code'] + [f'ZF{p}' for p in self.periods]
            if 'turn' in df.columns:
                columns.append('turn')
            if 'peTTM' in df.columns:
                columns.append('peTTM')
            if 'pBMRQ' in df.columns:
                columns.append('pBMRQ')
            return df[columns]
        except Exception as e:
            print(f"Failed to compute period returns: {e}")
            return None
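
    # Worked example for the return columns above: pct_change(n) is close[t] / close[t - n] - 1,
    # so for closing prices [10.0, 10.5, 11.0] the last row gets
    # ZF1 = (11.0 / 10.5 - 1) * 100 ≈ 4.76 and, for illustration only,
    # a 2-day return would be (11.0 / 10.0 - 1) * 100 = 10.0 (2 is not a configured period).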

    def step2_merge_data(self):
        """Step 2: merge every stock's returns into one large DataFrame."""
        print("\n" + "=" * 50)
        print("Step 2: merge all stock data")
        print("=" * 50)
        if not self.all_returns_data:
            print("No return data available")
            return None
        # Collect the per-stock frames
        all_data = list(self.all_returns_data.values())
        if not all_data:
            print("Nothing to merge")
            return None
        # Concatenate into a single DataFrame
        merged_df = pd.concat(all_data, ignore_index=True)
        merged_df['date'] = pd.to_datetime(merged_df['date'])
        print(f"Merge finished, {len(merged_df)} rows in total")
        return merged_df
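
    # Note (sketch): pd.concat holds every per-stock frame in memory at once. If that ever
    # becomes a problem, one could instead append each per-stock CSV to a single on-disk file
    # and read it back with pd.read_csv later; the rest of the pipeline would be unchanged.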

    def step3_normalize_ranking(self, merged_df):
        """Step 3: compute normalized (percentile) rankings per trading day."""
        print("\n" + "=" * 50)
        print("Step 3: compute normalized rankings")
        print("=" * 50)
        if merged_df is None or len(merged_df) == 0:
            print("No data to rank")
            return
        # Rank within each trading day
        ranking_data = []
        for date in merged_df['date'].unique():
            day_data = merged_df[merged_df['date'] == date].copy()
            # Rank each period column
            for period in self.periods[1:]:  # skip the 1-day return
                col = f'ZF{period}'
                rank_col = f'ZF{period}排名'
                if col in day_data.columns:
                    # Percentile rank scaled to 0-100
                    day_data[rank_col] = day_data[col].rank(pct=True) * 100
            ranking_data.append(day_data)
        if ranking_data:
            ranking_df = pd.concat(ranking_data, ignore_index=True)
            # Save each stock's ranking history
            for code in ranking_df['code'].unique():
                stock_data = ranking_df[ranking_df['code'] == code].copy()
                output_file = os.path.join(self.output_path, "周期涨幅排名", f"{code}.csv")
                stock_data.to_csv(output_file, index=False)
            print(f"Ranking finished, {len(ranking_df)} rows processed")
            return ranking_df
        return None
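
    # Alternative sketch (not used above): the per-date loop can be replaced by a single
    # grouped rank, which yields the same 0-100 percentile within each trading day, e.g.
    #     for period in self.periods[1:]:
    #         col = f'ZF{period}'
    #         merged_df[f'{col}排名'] = merged_df.groupby('date')[col].rank(pct=True) * 100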

    def load_industry_classification(self):
        """Load the industry classification table."""
        print("\n" + "=" * 50)
        print("Loading industry classification data")
        print("=" * 50)
        if not os.path.exists(self.industry_file):
            print(f"Industry classification file does not exist: {self.industry_file}")
            return None
        # Try several encodings when reading the classification table
        encodings = ['gbk', 'utf-8', 'gb2312', 'gb18030']
        for encoding in encodings:
            try:
                df = pd.read_csv(self.industry_file, encoding=encoding)
                # Normalize column names
                df.columns = df.columns.str.strip()
                # Locate the stock-code and industry columns
                code_col = None
                industry_col = None
                for col in df.columns:
                    col_lower = str(col).lower()
                    if any(keyword in col_lower for keyword in ['code', '代码', 'stock']):
                        code_col = col
                    if any(keyword in str(col) for keyword in ['行业', 'industry', '研究1']):
                        industry_col = col
                if code_col is None:
                    code_col = df.columns[0]
                if industry_col is None and len(df.columns) > 1:
                    industry_col = df.columns[1]
                if industry_col is None:
                    print("No industry column found in the classification file")
                    return None
                # Normalize stock codes to 6 digits
                df[code_col] = df[code_col].astype(str).str.zfill(6)
                # Build the code -> industry map
                industry_map = dict(zip(df[code_col], df[industry_col]))
                print(f"Loaded {len(industry_map)} industry classification entries")
                return industry_map
            except UnicodeDecodeError:
                continue
            except Exception as e:
                print(f"Failed to read industry classification file: {e}")
                continue
        return None

    def step4_group_by_industry(self, merged_df):
        """Step 4: aggregate statistics by industry."""
        print("\n" + "=" * 50)
        print("Step 4: group statistics by industry")
        print("=" * 50)
        if merged_df is None or len(merged_df) == 0:
            print("No data to group")
            return
        # Load the industry classification
        industry_map = self.load_industry_classification()
        if not industry_map:
            print("Industry classification unavailable, skipping industry analysis")
            return
        # Attach industry labels; codes without a mapping become NaN
        merged_df['industry'] = merged_df['code'].map(industry_map)
        # Aggregate by industry and date
        industry_stats = []
        for (industry, date), group in merged_df.groupby(['industry', 'date']):
            if len(group) == 0:
                continue
            stats = {
                'date': date,
                'industry': industry,
                'stock_count': len(group)
            }
            # Per-period statistics
            for period in self.periods:
                col = f'ZF{period}'
                if col in group.columns:
                    stats[f'{period}日涨幅均值'] = group[col].mean()
                    stats[f'{period}日涨幅中位数'] = group[col].median()
                    stats[f'{period}日涨幅>0数量'] = (group[col] > 0).sum()
                    stats[f'{period}日涨幅>0占比'] = (group[col] > 0).sum() / len(group)
            # Other indicators
            for col in ['turn', 'peTTM', 'pBMRQ']:
                if col in group.columns:
                    stats[f'{col}均值'] = group[col].mean()
                    stats[f'{col}中位数'] = group[col].median()
            industry_stats.append(stats)
        if industry_stats:
            industry_df = pd.DataFrame(industry_stats)
            # Save one file per industry
            for industry in industry_df['industry'].unique():
                industry_data = industry_df[industry_df['industry'] == industry]
                safe_name = str(industry).replace('/', '_').replace('\\', '_')
                output_file = os.path.join(self.output_path, "行业统计", f"{safe_name}.csv")
                industry_data.to_csv(output_file, index=False)
            print(f"Industry grouping finished, {len(industry_df)} rows processed")
            return industry_df
        return None
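
    # Coverage note (sketch): codes missing from the classification file are silently excluded
    # by the groupby above (NaN industry groups are dropped by default); a quick check such as
    # merged_df['industry'].isna().mean() shows what fraction of rows had no industry label.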

    def step5_daily_median_stats(self, merged_df):
        """Step 5: compute market-wide daily median statistics."""
        print("\n" + "=" * 50)
        print("Step 5: compute daily market statistics")
        print("=" * 50)
        if merged_df is None or len(merged_df) == 0:
            print("No data to summarize")
            return
        # Aggregate by trading day
        daily_stats = []
        for date in merged_df['date'].unique():
            day_data = merged_df[merged_df['date'] == date].copy()
            if len(day_data) == 0:
                continue
            stats = {'date': date}
            # Median return and share of gainers per period
            for period in self.periods:
                col = f'ZF{period}'
                if col in day_data.columns:
                    stats[f'{period}日涨幅中位数'] = day_data[col].median()
                    stats[f'{period}日涨幅>0占比'] = (day_data[col] > 0).sum() / len(day_data)
            # Medians of the other indicators
            for col in ['turn', 'peTTM', 'pBMRQ']:
                if col in day_data.columns:
                    stats[f'{col}中位数'] = day_data[col].median()
            # Number of stocks with data on this day
            stats['股票数量'] = len(day_data)
            daily_stats.append(stats)
        if daily_stats:
            daily_df = pd.DataFrame(daily_stats)
            daily_df['date'] = pd.to_datetime(daily_df['date'])
            daily_df = daily_df.sort_values('date')
            # Save the market-wide daily statistics
            output_file = os.path.join(self.output_path, "每日统计", "市场每日统计.csv")
            daily_df.to_csv(output_file, index=False)
            print(f"Daily statistics finished, {len(daily_df)} trading days")
            return daily_df
        return None
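
    # Downstream sketch (assumption, not part of the pipeline): the market-wide file written
    # above can be charted directly, for example
    #     daily = pd.read_csv(os.path.join(self.output_path, "每日统计", "市场每日统计.csv"),
    #                         parse_dates=["date"])
    #     daily.plot(x="date", y="20日涨幅中位数")
    # which additionally requires matplotlib to be installed.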

    def step6_export_results(self):
        """Step 6: summarize the exported results."""
        print("\n" + "=" * 50)
        print("Step 6: export summary of results")
        print("=" * 50)
        # Count the CSV files in each output directory
        directories = {
            "周期涨幅": os.path.join(self.output_path, "周期涨幅"),
            "周期涨幅排名": os.path.join(self.output_path, "周期涨幅排名"),
            "每日统计": os.path.join(self.output_path, "每日统计"),
            "行业统计": os.path.join(self.output_path, "行业统计")
        }
        for name, directory in directories.items():
            if os.path.exists(directory):
                files = [f for f in os.listdir(directory) if f.endswith('.csv')]
                print(f"{name}: {len(files)} files")
            else:
                print(f"{name}: directory does not exist")
        print("=" * 50)
        print("All steps finished!")
        print("=" * 50)

    def run_analysis(self):
        """Run the full six-step analysis pipeline."""
        print("Starting stock data analysis...")
        print(f"Stock data path: {self.stock_data_path}")
        print(f"Output path: {self.output_path}")
        start_time = datetime.now()
        # Execute the six-step plan
        step1_count = self.step1_traverse_and_calculate()
        if step1_count > 0:
            merged_df = self.step2_merge_data()
            if merged_df is not None:
                self.step3_normalize_ranking(merged_df)
                self.step4_group_by_industry(merged_df)
                self.step5_daily_median_stats(merged_df)
        self.step6_export_results()
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        print(f"\nTotal elapsed time: {duration:.2f} seconds")


if __name__ == "__main__":
    analyzer = StockDataAnalysisSystem()
    analyzer.run_analysis()
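
# Quick sanity check (sketch; the path and stock code below are illustrative placeholders
# derived from the configured output_path, not guaranteed to exist after a given run):
#     import pandas as pd
#     sample = pd.read_csv(r"F:\股票分析结果\周期涨幅\600000.csv", parse_dates=["date"])
#     print(sample[["date", "ZF1", "ZF20"]].tail())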