import pandas as pd, numpy as np, time
from pathlib import Path
from tqdm import tqdm
# Earliest trading date kept when cleaning the input files.
_MIN_DATE = '2000-01-04'

# Quantiles reported for every return period and every optional metric.
_QUANTILES = (0.25, 0.5, 0.75)

# Output metric name -> accepted lower-cased source column names.
# 'pe' / 'pb' are the keys the original lookup used; 'pettm' / 'pbmrq' are the
# lower-cased forms of the output names, accepted so that columns already
# called peTTM / pbMRQ are not silently dropped.
_OPTIONAL_METRICS = {
    'turn': ('turn',),
    'peTTM': ('pettm', 'pe'),
    'pbMRQ': ('pbmrq', 'pb'),
}


def _clean_price_frame(raw, stock):
    """Reduce one raw CSV frame to date/close plus any optional metrics.

    Args:
        raw: DataFrame exactly as returned by ``pd.read_csv``.
        stock: Identifier stored in the ``stock`` column of the result.

    Returns:
        The cleaned frame, or ``None`` when 5 or fewer usable rows remain.

    Raises:
        IndexError: If ``raw`` has fewer than two columns.
        ValueError: If the date and close columns resolve to the same column.
    """
    lowered = {c.lower(): c for c in raw.columns}
    # Substring match on the lower-cased name; fall back to column position.
    date_col = next((c for k, c in lowered.items() if 'date' in k), raw.columns[0])
    close_col = next((c for k, c in lowered.items() if 'close' in k), raw.columns[1])
    if date_col == close_col:
        raise ValueError("date and close resolve to the same column")

    # Rename by explicit mapping rather than by position, so a file that has
    # e.g. a PE column but no turnover column keeps the correct labels.
    rename = {date_col: 'date', close_col: 'close'}
    for target, aliases in _OPTIONAL_METRICS.items():
        source = next((lowered[a] for a in aliases if a in lowered), None)
        if source is not None and source not in rename:
            rename[source] = target

    frame = raw[list(rename)].rename(columns=rename)
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
    # Coerce every numeric column; stray strings become NaN instead of
    # breaking the quantile computation later on.
    for col in ('close', *_OPTIONAL_METRICS):
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

    frame = frame[frame['date'] >= _MIN_DATE].dropna(subset=['date', 'close'])
    if len(frame) <= 5:
        return None
    return frame.assign(stock=stock)


def _make_test_data():
    """Build a synthetic 50-stock, business-day data set used when no CSV loads."""
    stocks = [f"S{i:04d}" for i in range(50)]
    dates = pd.date_range('2000-01-04', '2024-12-31', freq='B')
    n_rows = len(dates) * len(stocks)
    return pd.DataFrame({
        # Each date is repeated once per stock while the stock list cycles,
        # so every (date, stock) pair appears exactly once.
        'date': np.repeat(dates, len(stocks)),
        'stock': stocks * len(dates),
        # NOTE: a single cumulative walk over all rows, not one per stock.
        'close': 100 + np.random.randn(n_rows).cumsum() * 3,
        'turn': np.random.uniform(0.1, 15, n_rows),
        'peTTM': np.random.uniform(1, 100, n_rows),
        'pbMRQ': np.random.uniform(0.1, 10, n_rows),
    })


def _daily_stats(group, periods):
    """Summarise one trading day's cross-section as a Series of statistics."""
    row = {}
    # Return quantiles are reported for every period except the first one.
    for p in periods[1:]:
        returns = group[f'r{p}'].dropna()
        for q in _QUANTILES:
            row[f'r{p}_{q}'] = returns.quantile(q)
    for metric in _OPTIONAL_METRICS:
        if metric in group:
            values = group[metric].dropna()
            for q in _QUANTILES:
                row[f'{metric}_{q}'] = values.quantile(q)
    # NOTE: NaN > 0 evaluates to False, so rows without enough history for a
    # period are counted as non-positive in the share below.
    for p in periods:
        row[f'{p}日占比'] = (group[f'r{p}'] > 0).mean() * 100
    return pd.Series(row)


def progress_analyzer(root=r"F:\stock_data", out=r"F:\自定义数据源",
                      periods=(5, 10, 15, 25, 30, 50, 90, 120, 250)):
    """Ultimate progress edition: 4 stages with progress bars and timing.

    Scans ``root`` recursively for CSV files, loads and cleans them, computes
    per-stock percentage returns for every period, aggregates per-date
    quantiles and positive-return shares, writes the result to an Excel file
    in ``out`` and prints a timing report. Falls back to synthetic data when
    no file yields a usable frame.

    Args:
        root: Directory searched recursively for ``*.csv`` files.
        out: Directory that receives the Excel report (created if missing).
        periods: Look-back lengths, in rows per stock, for the returns.
    """
    t0 = time.time()
    root, out = Path(root), Path(out)
    out.mkdir(parents=True, exist_ok=True)
    periods = list(periods)

    # Stage 1: file scan (with progress bar)
    print("🔍 阶段1: 扫描文件...")
    files = list(tqdm(root.rglob("*.csv"), desc="📂 扫描CSV"))
    scan_time = time.time() - t0

    # Stage 2: read and clean the data
    print(f"\n📊 阶段2: 读取数据 ({len(files)}个文件)...")
    t1 = time.time()
    dfs = []
    for f in tqdm(files, desc="📥 读取"):
        try:
            raw = pd.read_csv(f, encoding='gbk', low_memory=False)
            cleaned = _clean_price_frame(raw, f.stem)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the run, but
            # report it instead of hiding it (and let Ctrl-C through).
            tqdm.write(f"⚠️ 跳过 {f.name}: {exc}")
            continue
        if cleaned is not None:
            dfs.append(cleaned)

    if dfs:
        # Fresh 0..n-1 index: the per-file indexes overlap.
        df = pd.concat(dfs, ignore_index=True)
    else:
        # Generate test data
        print("🔄 生成测试数据...")
        df = _make_test_data()
    read_time = time.time() - t1

    # Stage 3: return calculation
    print(f"\n📈 阶段3: 计算涨幅 ({len(df)}条记录)...")
    t2 = time.time()
    df = df.sort_values(['stock', 'date'])
    for p in tqdm(periods, desc="⚡ 计算周期"):
        df[f'r{p}'] = df.groupby('stock')['close'].pct_change(p) * 100
    calc_time = time.time() - t2

    # Stage 4: per-date summary statistics
    print("\n📊 阶段4: 统计汇总...")
    t3 = time.time()
    stats = df.groupby('date').apply(lambda g: _daily_stats(g, periods)).reset_index()
    stats['日期'] = stats['date'].dt.strftime('%Y/%m/%d')
    stats = stats.drop('date', axis=1)

    # Arrange the columns in the required order.
    cols = ['日期']
    for p in periods[1:]:
        cols += [f'r{p}_{q}' for q in _QUANTILES]
    for metric in _OPTIONAL_METRICS:
        if f'{metric}_0.25' in stats.columns:
            cols += [f'{metric}_{q}' for q in _QUANTILES]
    cols += [f'{p}日占比' for p in periods]
    stats = stats[cols]

    # Save the result
    filename = out / f"进度统计_{int(time.time())}.xlsx"
    stats.to_excel(filename, index=False)
    stat_time = time.time() - t3  # includes the Excel write, as before
    total_time = time.time() - t0

    # Full timing report
    print("\n" + "=" * 50)
    print("📊 完整耗时报告")
    print("=" * 50)
    print(f"📂 文件扫描: {scan_time:.2f}s")
    print(f"📥 数据读取: {read_time:.2f}s")
    print(f"⚡ 涨幅计算: {calc_time:.2f}s")
    print(f"📊 统计汇总: {stat_time:.2f}s")
    print(f"⏱️ 总耗时: {total_time:.1f}s")
    print(f"📈 交易日: {len(stats)}天")
    print(f"📁 文件: {filename}")
    print("=" * 50)
if __name__ == "__main__":
    progress_analyzer()
    # Stray request text that was fused onto the call above (a syntax error
    # as code): "Explain every line of code in detail, with examples."