报错:KeyError: 'monthly_return'
代码:import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class MultiFactorArbitrageStrategy:
def __init__(self, data_path: str, start_date: str = '2014-01-01', end_date: str = '2024-12-31'):
"""
多因子多空套利策略
Args:
data_path: 数据文件路径
start_date: 回测开始日期
end_date: 回测结束日期
"""
self.data_path = data_path
self.start_date = pd.to_datetime(start_date)
self.end_date = pd.to_datetime(end_date)
self.data = None
self.factor_data = None
self.strategy_results = None
# 策略参数
self.window_size = 6 # 6个月滚动窗口
self.top_n = 20 # 选股数量
self.short_n = 20 # 做空数量
self.rebalance_freq = 'M' # 月度调仓
def load_and_preprocess_data(self):
"""加载和预处理数据"""
print("正在加载数据...")
self.data = pd.read_csv(self.data_path, low_memory=False)
# 处理重复列
if self.data.columns.duplicated().any():
self.data = self.data.loc[:, ~self.data.columns.duplicated()]
# 转换日期格式
self.data['TradingMonth'] = pd.to_datetime(self.data['TradingMonth'])
# 过滤时间范围
self.data = self.data[
(self.data['TradingMonth'] >= self.start_date) &
(self.data['TradingMonth'] <= self.end_date)
].copy()
# 确保数据按时间和股票代码排序
self.data = self.data.sort_values(['TradingMonth', 'Stkcd']).reset_index(drop=True)
# 删除重复数据
self.data = self.data.drop_duplicates(subset=['TradingMonth', 'Stkcd'], keep='first')
print(f"数据加载完成,共 {len(self.data)} 条记录")
print(f"时间范围: {self.data['TradingMonth'].min()} 到 {self.data['TradingMonth'].max()}")
print(f"股票数量: {self.data['Stkcd'].nunique()}")
def calculate_financial_factors(self, group: pd.DataFrame) -> dict:
"""计算财务因子"""
factors = {}
available_columns = set(group.columns)
# 财务指标映射
financial_indicators = {
'roe': ['ROE', 'roe', '净资产收益率'],
'roa': ['ROA', 'roa', '总资产收益率'],
'gross_profit_margin': ['GrossProfitMargin', 'gross_profit_margin', '毛利率'],
'net_profit_margin': ['NetProfitMargin', 'net_profit_margin', '净利率'],
'asset_turnover': ['AssetTurnover', 'asset_turnover', '总资产周转率'],
'current_ratio': ['CurrentRatio', 'current_ratio', '流动比率'],
'debt_to_equity': ['DebtToEquity', 'debt_to_equity', '资产负债率'],
'revenue_growth': ['RevenueGrowth', 'revenue_growth', '营业收入增长率'],
'profit_growth': ['ProfitGrowth', 'profit_growth', '净利润增长率'],
'pe_ratio': ['PERatio', 'pe_ratio', '市盈率'],
'pb_ratio': ['PBRatio', 'pb_ratio', '市净率'],
'ps_ratio': ['PSRatio', 'ps_ratio', '市销率']
}
for factor_name, possible_columns in financial_indicators.items():
found_column = None
for col in possible_columns:
if col in available_columns:
found_column = col
break
if found_column:
try:
value = group[found_column].mean()
if not pd.isna(value) and value != 0:
factors[factor_name] = value
except:
pass
return factors
def calculate_technical_factors(self, group: pd.DataFrame) -> dict:
"""计算技术因子"""
factors = {}
try:
# 价格相关因子
if 'Clsprc' in group.columns:
close = group['Clsprc']
high = group['Hiprc'] if 'Hiprc' in group.columns else close
low = group['Loprc'] if 'Loprc' in group.columns else close
# 移动平均线
for window in [5, 10, 20, 60]:
ma = close.rolling(window=window).mean().iloc[-1]
if not pd.isna(ma):
factors[f'ma{window}'] = ma
# MACD
exp1 = close.ewm(span=12, adjust=False).mean()
exp2 = close.ewm(span=26, adjust=False).mean()
macd = exp1 - exp2
signal = macd.ewm(span=9, adjust=False).mean()
if not pd.isna(macd.iloc[-1]):
factors['macd'] = macd.iloc[-1]
if not pd.isna(signal.iloc[-1]):
factors['macd_signal'] = signal.iloc[-1]
# RSI
delta = close.diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
rsi = 100 - (100 / (1 + rs.iloc[-1]))
if not pd.isna(rsi):
factors['rsi'] = rsi
# 布林带
bb_middle = close.rolling(window=20).mean().iloc[-1]
if not pd.isna(bb_middle):
factors['bb_middle'] = bb_middle
std = close.rolling(window=20).std().iloc[-1]
if not pd.isna(std):
factors['bb_upper'] = bb_middle + 2 * std
factors['bb_lower'] = bb_middle - 2 * std
# 收益率相关因子
returns = close.pct_change()
factors['return_mean'] = returns.mean()
factors['return_std'] = returns.std()
factors['return_skew'] = returns.skew()
factors['return_kurt'] = returns.kurtosis()
# 价格位置
factors['price_position'] = (close.iloc[-1] - low.min()) / (high.max() - low.min())
except Exception as e:
pass
return factors
def calculate_market_factors(self, group: pd.DataFrame) -> dict:
"""计算市场因子"""
factors = {}
try:
# 市值因子
if 'Dsmvtll' in group.columns:
market_value = group['Dsmvtll'].mean()
if not pd.isna(market_value):
factors['market_value'] = market_value
factors['log_market_value'] = np.log(market_value)
# 成交量因子
if 'Dnvaltrd' in group.columns:
volume = group['Dnvaltrd']
factors['volume_mean'] = volume.mean()
factors['volume_std'] = volume.std()
factors['volume_ratio'] = volume.iloc[-1] / volume.mean() if volume.mean() > 0 else 1
# 波动率因子
if 'Clsprc' in group.columns:
returns = group['Clsprc'].pct_change()
factors['volatility'] = returns.std() * np.sqrt(252) # 年化波动率
except Exception as e:
pass
return factors
def calculate_all_factors(self):
"""计算所有因子"""
print("正在计算因子...")
results = []
grouped_data = self.data.groupby(['TradingMonth', 'Stkcd'])
for (trading_month, stkcd), group in grouped_data:
factor_values = {
'TradingMonth': trading_month,
'Stkcd': stkcd
}
# 计算各类因子
financial_factors = self.calculate_financial_factors(group)
technical_factors = self.calculate_technical_factors(group)
market_factors = self.calculate_market_factors(group)
factor_values.update(financial_factors)
factor_values.update(technical_factors)
factor_values.update(market_factors)
results.append(factor_values)
self.factor_data = pd.DataFrame(results)
# 处理缺失值
factor_cols = [col for col in self.factor_data.columns if col not in ['TradingMonth', 'Stkcd']]
self.factor_data[factor_cols] = self.factor_data[factor_cols].fillna(method='ffill').fillna(method='bfill')
# 标准化因子
for col in factor_cols:
if self.factor_data[col].std() > 0:
self.factor_data[col] = (self.factor_data[col] - self.factor_data[col].mean()) / self.factor_data[col].std()
print(f"因子计算完成,共 {len(factor_cols)} 个因子")
print(f"因子数据形状: {self.factor_data.shape}")
def dynamic_factor_selection(self, date: pd.Timestamp, window_months: int = 6) -> list:
"""动态因子选择"""
# 获取历史数据窗口
start_date = date - pd.DateOffset(months=window_months)
historical_data = self.factor_data[
(self.factor_data['TradingMonth'] >= start_date) &
(self.factor_data['TradingMonth'] < date)
]
if len(historical_data) == 0:
return []
# 计算因子与未来收益的相关性
factor_cols = [col for col in historical_data.columns if col not in ['TradingMonth', 'Stkcd']]
# 计算未来1个月收益
historical_data = historical_data.sort_values(['Stkcd', 'TradingMonth'])
historical_data['future_return'] = historical_data.groupby('Stkcd')['TradingMonth'].shift(-1)
# 计算因子重要性(基于与未来收益的相关性)
factor_importance = {}
for factor in factor_cols:
try:
correlation = historical_data[factor].corr(historical_data['future_return'])
if not pd.isna(correlation):
factor_importance[factor] = abs(correlation)
except:
factor_importance[factor] = 0
# 选择最重要的因子
sorted_factors = sorted(factor_importance.items(), key=lambda x: x[1], reverse=True)
selected_factors = [factor for factor, importance in sorted_factors[:20]] # 选择前20个因子
return selected_factors
def calculate_stock_scores(self, date: pd.Timestamp, selected_factors: list) -> pd.DataFrame:
"""计算股票评分"""
current_data = self.factor_data[self.factor_data['TradingMonth'] == date].copy()
if len(current_data) == 0 or len(selected_factors) == 0:
return pd.DataFrame()
# 计算综合评分
score = 0
for factor in selected_factors:
if factor in current_data.columns:
# 对于某些因子,负值可能更好(如估值因子)
if factor in ['pe_ratio', 'pb_ratio', 'ps_ratio', 'debt_to_equity']:
score -= current_data[factor]
else:
score += current_data[factor]
current_data['score'] = score
return current_data.sort_values('score', ascending=False)
def calculate_returns(self, date: pd.Timestamp, positions: dict) -> float:
"""计算策略收益"""
next_month = date + pd.DateOffset(months=1)
current_prices = self.data[self.data['TradingMonth'] == date][['Stkcd', 'Clsprc']].set_index('Stkcd')['Clsprc']
next_prices = self.data[self.data['TradingMonth'] == next_month][['Stkcd', 'Clsprc']].set_index('Stkcd')['Clsprc']
total_return = 0
for stkcd, position in positions.items():
if stkcd in current_prices.index and stkcd in next_prices.index:
current_price = current_prices[stkcd]
next_price = next_prices[stkcd]
if current_price > 0:
stock_return = (next_price - current_price) / current_price
total_return += position * stock_return
return total_return / len(positions) if positions else 0
def run_backtest(self):
"""运行回测"""
print("开始回测...")
# 获取所有调仓日期
rebalance_dates = pd.date_range(
start=self.factor_data['TradingMonth'].min() + pd.DateOffset(months=self.window_size),
end=self.factor_data['TradingMonth'].max(),
freq=self.rebalance_freq
)
backtest_results = []
current_positions = {}
for date in rebalance_dates:
# 动态因子选择
selected_factors = self.dynamic_factor_selection(date)
if len(selected_factors) == 0:
continue
# 计算股票评分
scored_stocks = self.calculate_stock_scores(date, selected_factors)
if len(scored_stocks) == 0:
continue
# 选择多空股票
long_stocks = scored_stocks.head(self.top_n)['Stkcd'].tolist()
short_stocks = scored_stocks.tail(self.short_n)['Stkcd'].tolist()
# 更新持仓
new_positions = {}
for stock in long_stocks:
new_positions[stock] = 1.0 / self.top_n # 等权重
for stock in short_stocks:
new_positions[stock] = -1.0 / self.short_n # 等权重做空
# 计算换手率
turnover = 0
if current_positions:
all_stocks = set(current_positions.keys()) | set(new_positions.keys())
for stock in all_stocks:
old_pos = current_positions.get(stock, 0)
new_pos = new_positions.get(stock, 0)
turnover += abs(new_pos - old_pos)
turnover /= 2
# 计算收益
if current_positions:
monthly_return = self.calculate_returns(date, current_positions)
else:
monthly_return = 0
# 记录结果
backtest_results.append({
'date': date,
'monthly_return': monthly_return,
'turnover': turnover,
'long_count': len(long_stocks),
'short_count': len(short_stocks),
'selected_factors': selected_factors[:5], # 记录前5个因子
'long_stocks': long_stocks[:10], # 记录前10只做多股票
'short_stocks': short_stocks[:10] # 记录前10只做空股票
})
current_positions = new_positions
self.strategy_results = pd.DataFrame(backtest_results)
# 计算累计收益
self.strategy_results['cumulative_return'] = (1 + self.strategy_results['monthly_return']).cumprod()
print(f"回测完成,共 {len(self.strategy_results)} 个调仓周期")
def calculate_performance_metrics(self) -> dict:
"""计算策略绩效指标"""
if self.strategy_results is None or len(self.strategy_results) == 0:
return {}
returns = self.strategy_results['monthly_return'].values
cumulative_returns = self.strategy_results['cumulative_return'].values
# 年化收益率
total_return = cumulative_returns[-1] - 1
years = len(returns) / 12
annual_return = (1 + total_return) ** (1 / years) - 1
# 年化波动率
annual_volatility = np.std(returns) * np.sqrt(12)
# 夏普比率
risk_free_rate = 0.03 # 假设无风险利率为3%
sharpe_ratio = (annual_return - risk_free_rate) / annual_volatility if annual_volatility > 0 else 0
# 最大回撤
peak = np.maximum.accumulate(cumulative_returns)
drawdown = (cumulative_returns - peak) / peak
max_drawdown = np.min(drawdown)
# 胜率
win_rate = np.sum(returns > 0) / len(returns)
# 平均换手率
avg_turnover = self.strategy_results['turnover'].mean()
# 卡玛比率
calmar_ratio = annual_return / abs(max_drawdown) if max_drawdown != 0 else 0
metrics = {
'annual_return': annual_return,
'annual_volatility': annual_volatility,
'sharpe_ratio': sharpe_ratio,
'max_drawdown': max_drawdown,
'win_rate': win_rate,
'avg_turnover': avg_turnover,
'calmar_ratio': calmar_ratio,
'total_return': total_return,
'total_months': len(returns)
}
return metrics
def plot_performance(self):
"""绘制策略绩效图表"""
if self.strategy_results is None:
print("没有回测结果可供绘图")
return
# 创建子图
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('多因子多空套利策略回测结果', fontsize=16, fontweight='bold')
# 1. 累计收益率
axes[0, 0].plot(self.strategy_results['date'], self.strategy_results['cumulative_return'],
linewidth=2, color='blue', label='策略累计收益')
axes[0, 0].axhline(y=1, color='red', linestyle='--', alpha=0.5, label='基准线')
axes[0, 0].set_title('累计收益率')
axes[0, 0].set_xlabel('日期')
axes[0, 0].set_ylabel('累计收益率')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
# 2. 月度收益率分布
axes[0, 1].hist(self.strategy_results['monthly_return'], bins=30, alpha=0.7,
color='green', edgecolor='black')
axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.7, label='零收益线')
axes[0, 1].set_title('月度收益率分布')
axes[0, 1].set_xlabel('月度收益率')
axes[0, 1].set_ylabel('频次')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# 3. 回撤曲线
cumulative_returns = self.strategy_results['cumulative_return'].values
peak = np.maximum.accumulate(cumulative_returns)
drawdown = (cumulative_returns - peak) / peak
axes[1, 0].fill_between(self.strategy_results['date'], drawdown, 0,
alpha=0.3, color='red', label='回撤')
axes[1, 0].plot(self.strategy_results['date'], drawdown, color='red', linewidth=1)
axes[1, 0].set_title('回撤曲线')
axes[1, 0].set_xlabel('日期')
axes[1, 0].set_ylabel('回撤率')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# 4. 换手率
axes[1, 1].plot(self.strategy_results['date'], self.strategy_results['turnover'],
linewidth=1, color='orange', label='换手率')
axes[1, 1].axhline(y=self.strategy_results['turnover'].mean(), color='red',
linestyle='--', alpha=0.7, label=f'平均换手率: {self.strategy_results["turnover"].mean():.2%}')
axes[1, 1].set_title('换手率变化')
axes[1, 1].set_xlabel('日期')
axes[1, 1].set_ylabel('换手率')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('策略绩效图.png', dpi=300, bbox_inches='tight')
plt.show()
# 绘制持仓情况
self.plot_holdings()
def plot_holdings(self):
"""绘制持仓情况"""
if self.strategy_results is None:
return
# 统计持仓股票
all_long_stocks = []
all_short_stocks = []
for _, row in self.strategy_results.iterrows():
all_long_stocks.extend(row['long_stocks'])
all_short_stocks.extend(row['short_stocks'])
# 统计股票出现频次
long_freq = pd.Series(all_long_stocks).value_counts().head(20)
short_freq = pd.Series(all_short_stocks).value_counts().head(20)
# 绘制持仓频次图
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('策略持仓股票频次统计', fontsize=16, fontweight='bold')
# 做多股票频次
axes[0].barh(range(len(long_freq)), long_freq.values, color='green', alpha=0.7)
axes[0].set_yticks(range(len(long_freq)))
axes[0].set_yticklabels(long_freq.index)
axes[0].set_title('做多股票频次 (Top 20)')
axes[0].set_xlabel('出现频次')
axes[0].invert_yaxis()
# 做空股票频次
axes[1].barh(range(len(short_freq)), short_freq.values, color='red', alpha=0.7)
axes[1].set_yticks(range(len(short_freq)))
axes[1].set_yticklabels(short_freq.index)
axes[1].set_title('做空股票频次 (Top 20)')
axes[1].set_xlabel('出现频次')
axes[1].invert_yaxis()
plt.tight_layout()
plt.savefig('持仓情况图.png', dpi=300, bbox_inches='tight')
plt.show()
def print_performance_summary(self):
"""打印策略绩效摘要"""
metrics = self.calculate_performance_metrics()
if not metrics:
print("无法计算绩效指标")
return
print("\n" + "="*60)
print("多因子多空套利策略绩效摘要")
print("="*60)
print(f"回测期间: {self.start_date.strftime('%Y-%m-%d')} 至 {self.end_date.strftime('%Y-%m-%d')}")
print(f"总调仓次数: {metrics['total_months']}")
print(f"总收益率: {metrics['total_return']:.2%}")
print(f"年化收益率: {metrics['annual_return']:.2%}")
print(f"年化波动率: {metrics['annual_volatility']:.2%}")
print(f"夏普比率: {metrics['sharpe_ratio']:.3f}")
print(f"最大回撤: {metrics['max_drawdown']:.2%}")
print(f"胜率: {metrics['win_rate']:.2%}")
print(f"平均换手率: {metrics['avg_turnover']:.2%}")
print(f"卡玛比率: {metrics['calmar_ratio']:.3f}")
print("="*60)
# 打印最新持仓情况
if len(self.strategy_results) > 0:
latest_result = self.strategy_results.iloc[-1]
print(f"\n最新调仓日期: {latest_result['date'].strftime('%Y-%m-%d')}")
print(f"做多股票数量: {latest_result['long_count']}")
print(f"做空股票数量: {latest_result['short_count']}")
print(f"主要因子: {latest_result['selected_factors']}")
print(f"做多股票: {latest_result['long_stocks'][:5]}...")
print(f"做空股票: {latest_result['short_stocks'][:5]}...")
def run_strategy(self):
"""运行完整策略"""
print("开始运行多因子多空套利策略...")
# 1. 加载数据
self.load_and_preprocess_data()
# 2. 计算因子
self.calculate_all_factors()
# 3. 运行回测
self.run_backtest()
# 4. 计算绩效指标
metrics = self.calculate_performance_metrics()
# 5. 打印结果
self.print_performance_summary()
# 6. 绘制图表
self.plot_performance()
print("策略运行完成!")
def main():
"""主函数"""
# 创建策略实例
strategy = MultiFactorArbitrageStrategy(
data_path='/Users/diligence/PycharmProjects/mt_fx/merge_data/datafile/m_data/n_mdata1.csv',
start_date='2014-01-01',
end_date='2024-12-31'
)
# 运行策略
strategy.run_strategy()
if __name__ == "__main__":
main()