import baostock as bs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning process-wide, including pandas /
# numpy runtime warnings that can point at real data problems. Kept as-is to
# preserve the script's quiet output; consider narrowing it to specific
# categories once the noisy warnings are identified.
warnings.filterwarnings('ignore')

# Fonts that can render the Chinese chart labels. matplotlib tries the list in
# order and uses the first family that is installed, so the original choice
# (SimHei, typically Windows-only) stays first and the remaining entries are
# fallbacks for other platforms.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Arial Unicode MS',
    'WenQuanYi Micro Hei',
    'Noto Sans CJK SC',
]
# Draw the minus sign with an ASCII hyphen; CJK fonts often lack U+2212.
plt.rcParams['axes.unicode_minus'] = False
class MultiFactorStockSelection:
    """Monthly-rebalanced multi-factor stock-selection backtest on baostock data.

    For every rebalance date the pipeline is: build a stock pool, compute three
    factors per stock (value, a price-based quality proxy, momentum), winsorize
    and z-score them, combine them into a weighted score, hold the top-scoring
    names and record the portfolio value.

    Known simplifications (unchanged from the original design): no ST /
    suspension filter, the "quality" factor is a price-return proxy rather
    than real ROE, and share counts are not rounded to board lots.
    """

    # Code prefixes treated as tradable A-shares (Shanghai main board / STAR,
    # Shenzhen main board / ChiNext). Everything else returned by
    # query_all_stock (indices such as sh.000xxx / sz.399xxx, funds, B-shares)
    # is excluded. NOTE(review): verify against the current baostock code list.
    A_SHARE_PREFIXES = ('sh.60', 'sh.68', 'sz.00', 'sz.30')
    # One-way transaction cost applied to both buys and sells.
    TRADE_COST_RATE = 0.0003
    # Caps that keep a backtest run fast (number of network queries per date).
    MAX_POOL_SIZE = 300
    MAX_FACTOR_STOCKS = 100
    # Minimum number of history rows before the quality proxy is computed.
    MIN_HISTORY_ROWS = 60
    # Raw factor columns produced by calculate_factors.
    FACTOR_COLUMNS = ('value_pb', 'quality_roe', 'momentum_3m')

    def __init__(self):
        self.lg = None          # baostock login result, None while logged out
        self.stock_pool = []

    # ------------------------------------------------------------------
    # Session handling
    # ------------------------------------------------------------------
    def login_baostock(self):
        """Log in to baostock; return True on success, False otherwise."""
        self.lg = bs.login()
        if self.lg.error_code != '0':
            print(f'登录失败: {self.lg.error_msg}')
            return False
        print('baostock登录成功')
        return True

    def logout_baostock(self):
        """Log out of baostock if a login was attempted; safe to call twice."""
        if self.lg is not None:
            bs.logout()
            # Reset so a repeated call does not log out a second time.
            self.lg = None
            print('baostock已登出')

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _safe_float(value):
        """Convert ``value`` to float, returning NaN for '', None or junk."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return float('nan')

    @staticmethod
    def _first_trading_days(trading_days):
        """Return the earliest date of each calendar month in ``trading_days``.

        ``trading_days`` is any iterable of dates / date strings; the result is
        a chronologically ordered list of ``pd.Timestamp``.
        """
        days = pd.Series(pd.to_datetime(list(trading_days))).dropna()
        if days.empty:
            return []
        return days.groupby(days.dt.to_period('M')).min().tolist()

    @staticmethod
    def _align_benchmark(benchmark_data, dates):
        """Return, for each date, the latest benchmark close on or before it.

        Dates that precede all benchmark rows map to NaN so the result always
        has the same length as ``dates``.
        """
        closes = benchmark_data.dropna(subset=['date', 'close']).sort_values('date')
        aligned = []
        for date in dates:
            on_or_before = closes[closes['date'] <= pd.to_datetime(date)]
            if on_or_before.empty:
                aligned.append(float('nan'))
            else:
                aligned.append(float(on_or_before['close'].iloc[-1]))
        return aligned

    # ------------------------------------------------------------------
    # Data access
    # ------------------------------------------------------------------
    def get_all_stocks(self, date):
        """Return all security codes baostock lists for ``date`` ([] on failure)."""
        try:
            rs = bs.query_all_stock(date)
            if rs.error_code != '0':
                print(f"获取股票列表失败: {rs.error_msg}")
                return []
            stock_list = rs.get_data()
        except Exception as e:  # network / library errors: treat as "no data"
            print(f"获取股票列表异常: {e}")
            return []

        if stock_list.empty or 'code' not in stock_list.columns:
            print("获取的股票列表为空")
            return []

        valid_stocks = [
            code for code in stock_list['code']
            if isinstance(code, str) and len(code) >= 6 and code != 'code'
        ]
        print(f"获取到 {len(valid_stocks)} 只有效股票")
        return valid_stocks

    def clean_stock_pool(self, stock_list, clean_date, max_stocks=300):
        """Filter ``stock_list`` down to A-share codes, capped at ``max_stocks``.

        Only codes starting with one of ``A_SHARE_PREFIXES`` are kept, which
        removes indices, funds and B-shares. ``clean_date`` is accepted for
        interface compatibility but not used: ST / suspension screening is not
        implemented.
        """
        if not stock_list:
            return []

        cleaned_stocks = []
        for code in stock_list:
            if not isinstance(code, str) or len(code) < 6:
                continue
            if not code.startswith(self.A_SHARE_PREFIXES):
                continue
            cleaned_stocks.append(code)
            # Cap the pool size to bound the number of history queries.
            if len(cleaned_stocks) >= max_stocks:
                break

        print(f"股票池清洗完成:原始{len(stock_list)}只,清洗后{len(cleaned_stocks)}只")
        return cleaned_stocks

    def get_stock_data(self, code, start_date, end_date, adjustflag="3"):
        """Fetch daily bars plus peTTM / pbMRQ for ``code``.

        ``adjustflag`` is passed straight to baostock ("3" unadjusted, the
        original default; "2" forward-adjusted; "1" backward-adjusted).
        Returns a date-sorted DataFrame with numeric columns, or None when the
        query fails or yields no rows.
        """
        if not isinstance(code, str) or len(code) < 6:
            return None
        try:
            rs = bs.query_history_k_data_plus(
                code,
                "date,code,open,high,low,close,volume,turn,peTTM,pbMRQ",
                start_date=start_date,
                end_date=end_date,
                frequency="d",
                adjustflag=adjustflag,
            )
            if rs.error_code != '0':
                return None
            df = rs.get_data()
        except Exception as e:
            print(f"获取股票 {code} 数据失败: {e}")
            return None

        if df.empty:
            return None

        # baostock returns every field as text; empty strings become NaN.
        numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'turn', 'peTTM', 'pbMRQ']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)
        return None if df.empty else df

    def get_trade_price(self, code, trade_date, max_lookahead_days=15):
        """Return the execution price for an order decided on ``trade_date``.

        The price is the open (falling back to the close) of the first session
        strictly after ``trade_date`` that has a positive quote. The search
        covers ``max_lookahead_days`` calendar days so weekends and holiday
        breaks do not produce a missing price. Returns None when nothing
        usable is found.
        """
        if not isinstance(code, str) or len(code) < 6:
            return None
        try:
            trade_dt = datetime.strptime(trade_date, '%Y-%m-%d')
            window_start = (trade_dt + timedelta(days=1)).strftime('%Y-%m-%d')
            window_end = (
                trade_dt + timedelta(days=max(1, max_lookahead_days))
            ).strftime('%Y-%m-%d')
            rs = bs.query_history_k_data_plus(
                code,
                "date,open,close",
                start_date=window_start,
                end_date=window_end,
                frequency="d",
                adjustflag="3",  # trades execute at real, unadjusted prices
            )
            if rs.error_code != '0':
                return None
            price_data = rs.get_data()
        except Exception as e:
            print(f"获取 {code} 交易价格失败: {e}")
            return None

        if price_data.empty:
            return None

        for _, bar in price_data.sort_values('date').iterrows():
            for field in ('open', 'close'):
                price = self._safe_float(bar.get(field))
                if price > 0:  # False for NaN as well
                    return price
        return None

    def get_benchmark_data(self, start_date, end_date):
        """Fetch CSI 300 (sh.000300) daily closes; None when unavailable."""
        try:
            rs = bs.query_history_k_data_plus(
                "sh.000300",
                "date,close",
                start_date=start_date,
                end_date=end_date,
                frequency="d",
                adjustflag="3",
            )
            if rs.error_code != '0':
                print(f"获取基准数据失败: {rs.error_msg}")
                return None
            benchmark_df = rs.get_data()
        except Exception as e:
            print(f"获取基准数据异常: {e}")
            return None

        if benchmark_df.empty:
            return None
        benchmark_df['date'] = pd.to_datetime(benchmark_df['date'], errors='coerce')
        benchmark_df['close'] = pd.to_numeric(benchmark_df['close'], errors='coerce')
        benchmark_df = benchmark_df.dropna(subset=['date', 'close'])
        return None if benchmark_df.empty else benchmark_df.reset_index(drop=True)

    # ------------------------------------------------------------------
    # Factor computation
    # ------------------------------------------------------------------
    def calculate_factors(self, df, current_date, momentum_window=60):
        """Compute the three raw factors for one stock as of ``current_date``.

        Returns a one-row DataFrame (the latest bar on or before
        ``current_date``) with three extra columns, or None when no such bar
        exists:

        * ``value_pb``    - 1 / pbMRQ (0 when PB is missing or not positive)
        * ``quality_roe`` - return over the whole supplied history; a price
          proxy, not real ROE (0 with fewer than ``MIN_HISTORY_ROWS`` rows)
        * ``momentum_3m`` - return over the most recent ``momentum_window``
          rows (0 when the history is shorter than that)
        """
        if df is None or df.empty:
            return None
        try:
            as_of = pd.to_datetime(current_date)
            history = df[df['date'] <= as_of].sort_values('date')
            if history.empty:
                return None

            result_data = history.tail(1).copy()
            close = history['close']
            current_price = self._safe_float(close.iloc[-1])

            # 1. Value: inverse price-to-book.
            pb_value = float('nan')
            if 'pbMRQ' in result_data.columns:
                pb_value = self._safe_float(result_data['pbMRQ'].iloc[0])
            result_data['value_pb'] = 1 / pb_value if pb_value > 0 else 0

            # 2. Quality proxy: return since the first row of the window.
            quality = 0
            if len(history) >= self.MIN_HISTORY_ROWS:
                start_price = self._safe_float(close.iloc[0])
                if start_price > 0:
                    quality = current_price / start_price - 1
            result_data['quality_roe'] = quality

            # 3. Momentum: return over the last `momentum_window` rows, so it
            #    measures roughly three months instead of duplicating the
            #    whole-window quality proxy.
            momentum = 0
            if momentum_window > 0 and len(history) >= momentum_window:
                past_price = self._safe_float(close.iloc[-momentum_window])
                if past_price > 0:
                    momentum = current_price / past_price - 1
            result_data['momentum_3m'] = momentum

            return result_data
        except (KeyError, IndexError, TypeError, ValueError) as e:
            print(f"计算因子失败: {e}")
            return None

    def remove_outliers(self, df, factor_columns, lower=0.05, upper=0.95):
        """Winsorize each factor to its [``lower``, ``upper``] quantiles.

        Returns a new DataFrame; the input is left untouched. Missing columns
        and all-NaN columns are skipped.
        """
        if df.empty:
            return df
        result = df.copy()
        for factor in factor_columns:
            if factor not in result.columns:
                continue
            valid = result[factor].dropna()
            if valid.empty:
                continue
            result[factor] = result[factor].clip(
                valid.quantile(lower), valid.quantile(upper)
            )
        return result

    def standardize_factors(self, df, factor_columns):
        """Add a z-scored ``<factor>_std`` column for each factor.

        Returns a new DataFrame. A factor with fewer than two valid values or
        zero dispersion gets an all-zero standardized column.
        """
        if df.empty:
            return df
        result = df.copy()
        for factor in factor_columns:
            if factor not in result.columns:
                continue
            valid = result[factor].dropna()
            std_val = valid.std() if len(valid) > 1 else 0
            if std_val > 0:
                result[f'{factor}_std'] = (result[factor] - valid.mean()) / std_val
            else:
                result[f'{factor}_std'] = 0
        return result

    def calculate_composite_score(self, df, weights=None):
        """Add ``composite_score`` as the weighted sum of standardized factors.

        ``weights`` maps standardized column names to weights and defaults to
        40% value / 30% quality / 30% momentum. Missing factor values count as
        0 (neutral after z-scoring). Returns a new DataFrame.
        """
        if df.empty:
            return df
        if weights is None:
            weights = {'value_pb_std': 0.4, 'quality_roe_std': 0.3, 'momentum_3m_std': 0.3}
        result = df.copy()
        result['composite_score'] = 0.0
        for factor, weight in weights.items():
            if factor in result.columns:
                result['composite_score'] += result[factor].fillna(0) * weight
        return result

    def select_top_stocks(self, df, top_pct=0.1):
        """Return ``code`` / ``composite_score`` of the top ``top_pct`` stocks.

        At least one stock is returned whenever any stock has a score.
        """
        if df is None or df.empty:
            return pd.DataFrame()
        scored = df[df['composite_score'].notna()]
        if scored.empty:
            return pd.DataFrame()
        # Stable sort keeps tie order deterministic between runs.
        ranked = scored.sort_values('composite_score', ascending=False, kind='mergesort')
        select_count = max(1, int(len(ranked) * top_pct))
        return ranked.head(select_count)[['code', 'composite_score']].copy()

    # ------------------------------------------------------------------
    # Backtest
    # ------------------------------------------------------------------
    def _get_rebalance_dates(self, start_date, end_date):
        """Return the first trading day of each month within the range.

        Uses the baostock trade calendar. The calendar is queried from the
        first day of the start month so a mid-month ``start_date`` does not
        mistake a later session for that month's first one. Falls back to
        calendar month starts when the calendar cannot be fetched.
        """
        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)
        trading_days = []
        try:
            rs = bs.query_trade_dates(
                start_date=start_dt.replace(day=1).strftime('%Y-%m-%d'),
                end_date=end_dt.strftime('%Y-%m-%d'),
            )
            if rs.error_code == '0':
                calendar = rs.get_data()
                if not calendar.empty:
                    is_open = calendar['is_trading_day'].astype(str) == '1'
                    trading_days = calendar.loc[is_open, 'calendar_date'].tolist()
            else:
                print(f"获取交易日历失败: {rs.error_msg}")
        except Exception as e:
            print(f"获取交易日历异常: {e}")

        if trading_days:
            first_days = self._first_trading_days(trading_days)
        else:
            print("交易日历不可用,退回使用每月1日作为调仓日")
            first_days = list(pd.date_range(start_dt, end_dt, freq='MS'))
        return [day for day in first_days if start_dt <= day <= end_dt]

    def _select_codes(self, rebalance_date):
        """Run pool building, factor scoring and selection for one date.

        Returns the list of selected codes, or [] (after printing the reason)
        when any stage produces nothing.
        """
        current_date = rebalance_date.strftime('%Y-%m-%d')

        all_stocks = self.get_all_stocks(current_date)
        if not all_stocks:
            print("获取股票列表失败,跳过本次调仓")
            return []
        cleaned_stocks = self.clean_stock_pool(
            all_stocks, current_date, max_stocks=self.MAX_POOL_SIZE
        )
        if not cleaned_stocks:
            print("清洗后无有效股票,跳过本次调仓")
            return []

        # About four months of history so the 60-row factors can be computed.
        history_start = (rebalance_date - timedelta(days=120)).strftime('%Y-%m-%d')
        factor_rows = []
        for code in cleaned_stocks:
            # Forward-adjusted prices keep dividends / splits from showing up
            # as spurious returns in the price-based factors.
            stock_data = self.get_stock_data(
                code, history_start, current_date, adjustflag="2"
            )
            if stock_data is None or len(stock_data) < 20:
                continue
            factor_data = self.calculate_factors(stock_data, current_date)
            if factor_data is None or factor_data.empty:
                continue
            factor_rows.append(factor_data)
            if len(factor_rows) >= self.MAX_FACTOR_STOCKS:
                break
        print(f"成功计算 {len(factor_rows)} 只股票的因子")
        if not factor_rows:
            print("没有有效的因子数据,跳过本次调仓")
            return []

        combined_data = pd.concat(factor_rows, ignore_index=True)
        factor_columns = list(self.FACTOR_COLUMNS)
        cleaned_data = self.remove_outliers(combined_data, factor_columns)
        standardized_data = self.standardize_factors(cleaned_data, factor_columns)
        scored_data = self.calculate_composite_score(standardized_data)

        selected_stocks = self.select_top_stocks(scored_data, 0.1)
        if selected_stocks.empty:
            print("选股结果为空,跳过本次调仓")
            return []
        print(f"选中股票数量: {len(selected_stocks)}")
        return selected_stocks['code'].tolist()

    def _rebalance_portfolio(self, selected_codes, trade_date, cash, positions, last_prices):
        """Trade towards ``selected_codes`` and value the resulting holdings.

        ``positions`` ({code: shares}) and ``last_prices`` ({code: price}) are
        updated in place. Returns ``(cash, position_values)`` where
        ``position_values`` holds the market value of every priced holding.
        """
        quotes = {}

        def quote(code):
            # One network query per code per rebalance; remember good prices
            # so a later missing quote can fall back to them.
            if code not in quotes:
                price = self.get_trade_price(code, trade_date)
                quotes[code] = price if price is not None and price > 0 else None
                if quotes[code] is not None:
                    last_prices[code] = quotes[code]
            return quotes[code]

        # Sell holdings that dropped out of the selection.
        keep = set(selected_codes)
        for code in [held for held in positions if held not in keep]:
            price = quote(code)
            if price is None:
                continue  # no quote: cannot sell, keep holding
            shares = positions.pop(code)
            cash += shares * price * (1 - self.TRADE_COST_RATE)
            print(f"卖出 {code}: {shares}股 @ {price:.2f}")

        # Buy new names, splitting available cash equally among them.
        to_buy = [code for code in dict.fromkeys(selected_codes) if code not in positions]
        if to_buy:
            budget = cash / len(to_buy)
            cost_factor = 1 + self.TRADE_COST_RATE
            for code in to_buy:
                price = quote(code)
                if price is None:
                    continue
                # Size the order with the fee included so it fits the budget.
                shares = int(budget / (price * cost_factor))
                if shares <= 0:
                    continue
                buy_cost = shares * price * cost_factor
                if buy_cost > cash:
                    continue
                positions[code] = shares
                cash -= buy_cost
                print(f"买入 {code}: {shares}股 @ {price:.2f}")

        # Mark holdings to market; fall back to the last known price so a
        # missing quote does not show up as a fake loss.
        position_values = []
        for code, shares in positions.items():
            price = quote(code)
            if price is None:
                price = last_prices.get(code)
            if price is None:
                print(f"警告: {code} 无可用价格,本期按0估值")
                continue
            position_values.append(shares * price)
        return cash, position_values

    def run_backtest(self, start_date, end_date, initial_capital=1000000):
        """Run the monthly backtest between ``start_date`` and ``end_date``.

        Returns ``(portfolio_records, portfolio_values, benchmark_data,
        dates_record)``; all four are None when login fails, no rebalance date
        exists or an unexpected error occurs. Logs out of baostock on exit.
        """
        print("开始多因子选股回测...")
        if not self.login_baostock():
            return None, None, None, None
        try:
            rebalance_dates = self._get_rebalance_dates(start_date, end_date)
            if not rebalance_dates:
                print("没有找到调仓日期,请检查时间范围")
                return None, None, None, None
            print(f"找到 {len(rebalance_dates)} 个调仓日期")

            portfolio_records = []
            portfolio_values = []
            dates_record = []
            current_cash = float(initial_capital)
            current_positions = {}  # {code: shares}
            last_prices = {}        # {code: last successfully fetched price}

            benchmark_data = self.get_benchmark_data(start_date, end_date)

            for rebalance_date in rebalance_dates:
                current_date = rebalance_date.strftime('%Y-%m-%d')
                print(f"\n=== 调仓日: {current_date} ===")

                selected_codes = self._select_codes(rebalance_date)
                if not selected_codes:
                    continue

                current_cash, position_values = self._rebalance_portfolio(
                    selected_codes, current_date, current_cash,
                    current_positions, last_prices,
                )
                stock_value = sum(position_values)
                portfolio_value = current_cash + stock_value

                portfolio_records.append({
                    'date': current_date,
                    'cash': current_cash,
                    'stock_value': stock_value,
                    'total_value': portfolio_value,
                    'num_stocks': len(current_positions),
                    'avg_position_value': (
                        stock_value / len(position_values) if position_values else 0
                    ),
                })
                portfolio_values.append(portfolio_value)
                dates_record.append(current_date)

                print(f"组合总价值: {portfolio_value:,.2f}元")
                print(f"现金: {current_cash:,.2f}元")
                print(f"持股数量: {len(current_positions)}只")
                print(f"股票市值: {stock_value:,.2f}元")

            print(f"\n回测完成,共处理 {len(portfolio_records)} 次调仓")
            return portfolio_records, portfolio_values, benchmark_data, dates_record
        except Exception as e:  # top-level boundary: report and fail the run
            print(f"回测过程中出现错误: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None, None
        finally:
            self.logout_baostock()

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def calculate_performance_metrics(self, portfolio_values, benchmark_values=None):
        """Compute performance metrics from monthly portfolio values.

        Returns a dict keyed by the Chinese metric names used in the report
        (percent values except for the two ratios). Returns {} with fewer than
        two values or a non-positive starting value. When ``benchmark_values``
        holds at least two valid points, benchmark cumulative return and
        excess return are added.
        """
        if portfolio_values is None or len(portfolio_values) < 2:
            return {}
        values = pd.Series(list(portfolio_values), dtype=float)
        if not values.iloc[0] > 0:
            return {}

        returns = values.pct_change().dropna()
        metrics = {}

        total_return = (values.iloc[-1] / values.iloc[0] - 1) * 100
        metrics['累计收益率'] = float(total_return)

        # N monthly observations span N-1 months.
        years = (len(values) - 1) / 12
        growth = 1 + total_return / 100
        annual_return = (growth ** (1 / years) - 1) * 100 if growth > 0 else -100.0
        metrics['年化收益率'] = float(annual_return)

        # std() is NaN with a single return; report that as zero volatility.
        volatility = returns.std() * np.sqrt(12) * 100
        annual_volatility = 0.0 if pd.isna(volatility) else float(volatility)
        metrics['年化波动率'] = annual_volatility

        risk_free_rate = 0.03
        if annual_volatility > 0:
            sharpe_ratio = (annual_return - risk_free_rate * 100) / annual_volatility
        else:
            sharpe_ratio = 0.0
        metrics['夏普比率'] = float(sharpe_ratio)

        # Drawdown on the value series itself so the starting value counts as
        # a peak and a loss in the first period is captured.
        drawdown = values / values.cummax() - 1
        max_drawdown = float(drawdown.min() * 100)
        metrics['最大回撤'] = max_drawdown

        metrics['卡玛比率'] = (
            float(annual_return / abs(max_drawdown)) if max_drawdown != 0 else 0
        )

        if benchmark_values is not None:
            benchmark = pd.Series(list(benchmark_values), dtype=float).dropna()
            if len(benchmark) >= 2 and benchmark.iloc[0] > 0:
                benchmark_return = (benchmark.iloc[-1] / benchmark.iloc[0] - 1) * 100
                metrics['基准累计收益率'] = float(benchmark_return)
                metrics['超额收益率'] = float(total_return - benchmark_return)
        return metrics

    def plot_results(self, portfolio_records, benchmark_data=None, dates=None):
        """Plot the portfolio value against the benchmark, plus drawdown.

        ``dates`` is accepted for interface compatibility; the x-axis always
        uses the dates stored in ``portfolio_records``.
        """
        if not portfolio_records:
            print("没有有效数据可绘制")
            return

        dates = [record['date'] for record in portfolio_records]
        values = [record['total_value'] for record in portfolio_records]
        periods = np.arange(len(dates))

        _, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        ax1.plot(periods, values, label='多因子策略', linewidth=2, color='blue')
        if benchmark_data is not None and not benchmark_data.empty:
            benchmark = np.array(self._align_benchmark(benchmark_data, dates), dtype=float)
            valid_idx = np.flatnonzero(~np.isnan(benchmark))
            if valid_idx.size and benchmark[valid_idx[0]] > 0:
                # Rescale so both curves coincide at the first benchmark point.
                anchor = valid_idx[0]
                benchmark_normalized = benchmark / benchmark[anchor] * values[anchor]
                ax1.plot(periods, benchmark_normalized, label='沪深300',
                         linewidth=2, color='red')
        ax1.set_title('多因子选股策略净值曲线 vs 沪深300', fontsize=14)
        ax1.set_ylabel('组合净值(元)')
        ax1.legend()
        ax1.grid(True)

        if len(values) > 1:
            value_series = pd.Series(values, dtype=float)
            drawdown = ((value_series / value_series.cummax() - 1) * 100).to_numpy()
            ax2.fill_between(periods, drawdown, 0, color='red', alpha=0.3)
            ax2.plot(periods, drawdown, color='red', linewidth=1)
        ax2.set_title('策略回撤')
        ax2.set_ylabel('回撤 (%)')
        ax2.set_xlabel('调仓期数')
        ax2.set_xticks(periods)
        ax2.set_xticklabels(dates, rotation=45)
        ax2.grid(True)

        plt.tight_layout()
        plt.show()

    def generate_report(self, portfolio_records, metrics):
        """Print the metrics and a short summary of the backtest."""
        print("\n" + "=" * 60)
        print("多因子选股策略绩效报告")
        print("=" * 60)
        for key, value in metrics.items():
            # Ratios are unit-less; everything else is a percentage.
            if '比率' in key:
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value:.2f}%")
        if portfolio_records:
            first, last = portfolio_records[0], portfolio_records[-1]
            print("\n策略概要:")
            print(f"回测期间: {first['date']} 至 {last['date']}")
            print(f"期初资金: {first['total_value']:,.2f}元")
            print(f"期末资金: {last['total_value']:,.2f}元")
            print(f"总调仓次数: {len(portfolio_records)}次")
        print("=" * 60)
# 主执行函数
def main():
    """Run a sample half-year backtest, then print the report and show charts."""
    strategy = MultiFactorStockSelection()

    # A short window keeps the sample run quick.
    start_date = '2020-06-01'
    end_date = '2020-12-31'
    initial_capital = 1000000

    print("多因子选股模型在A股市场的应用研究")
    print("回测参数:")
    print(f"期间: {start_date} 至 {end_date}")
    print(f"初始资金: {initial_capital:,}元")
    print("调仓频率: 月度")
    print("选股比例: 前10%")
    print("交易成本: 单边0.03%")

    portfolio_records, portfolio_values, benchmark_data, dates = strategy.run_backtest(
        start_date, end_date, initial_capital
    )
    if not portfolio_records:
        print("回测失败,请检查参数和数据连接")
        return

    # Align the benchmark with the rebalance dates. Each date takes the most
    # recent close on or before it (NaN if none exists), so the list always
    # has one entry per rebalance date even when a date is not a trading day.
    benchmark_values = None
    if benchmark_data is not None and not benchmark_data.empty and dates:
        closes = benchmark_data.sort_values('date')
        benchmark_values = []
        for date in dates:
            on_or_before = closes[closes['date'] <= pd.to_datetime(date)]
            if on_or_before.empty:
                benchmark_values.append(np.nan)
            else:
                benchmark_values.append(on_or_before['close'].iloc[-1])

    metrics = strategy.calculate_performance_metrics(portfolio_values, benchmark_values)

    strategy.generate_report(portfolio_records, metrics)
    strategy.plot_results(portfolio_records, benchmark_data, dates)
# Run the sample backtest only when executed as a script, not on import.
if __name__ == "__main__":
    main()