import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from Return import *
from datetime import *
from pandas.tseries.offsets import BMonthEnd
warnings.filterwarnings('ignore')
class FactorBase:
    """Base class that every factor implementation should inherit from.

    It keeps private copies of the three input tables, derives the
    ``ipc_secondary`` column on ``t2``, normalises the date columns of
    ``t4`` and collects the set of A-share tickers found in ``t4``.
    """

    # Market codes whose presence in a ticker marks it as an A-share ticker.
    _A_SHARE_MARKETS = ('SZ', 'SH', 'BJ')

    def __init__(self, t1, t2, t4):
        """Store copies of the inputs and run the shared preprocessing.

        Parameters:
            t1: DataFrame, patent detail data.
            t2: DataFrame, patent change-history records.
            t4: DataFrame, patent holding records.
        """
        self.t1 = t1.copy()
        self.t2 = t2.copy()
        self.t4 = t4.copy()
        self._preprocess_data()

    def _extract_ipc_list(self, ipc_str):
        """Return the distinct 3-character IPC prefixes found in ``ipc_str``.

        ``ipc_str`` is expected to be a string such as ``"['A01B1/00', 'B02C']"``;
        quotes and square brackets are stripped and the remainder is split on
        commas. Anything that is not a string (NaN, None, lists, numbers)
        yields an empty list. The result is sorted so that it is reproducible
        across runs.
        """
        if not isinstance(ipc_str, str):
            return []
        cleaned = ipc_str.replace("'", "").replace("[", "").replace("]", "")
        codes = (code.strip() for code in cleaned.split(","))
        # Codes shorter than three characters cannot carry a full prefix.
        return sorted({code[:3] for code in codes if len(code) >= 3})

    def _preprocess_data(self):
        """Run the preprocessing shared by all factors."""
        # Only derive ipc_secondary when t2 does not already provide it.
        if 'ipc_secondary' not in self.t2.columns:
            self.t2['ipc_secondary'] = self.t2['ipc_code'].apply(self._extract_ipc_list)

        # Make sure the holding-period columns of t4 are datetimes.
        for column in ('start_date', 'end_date'):
            if column in self.t4.columns and not pd.api.types.is_datetime64_any_dtype(self.t4[column]):
                self.t4[column] = pd.to_datetime(self.t4[column])

        # Collect every A-share ticker; a cell may hold several tickers
        # separated by '/'.
        self.a_share_tickers = set()
        if 'ticker' in self.t4.columns:
            for raw_tickers in self.t4['ticker'].dropna():
                if not isinstance(raw_tickers, str):
                    continue
                self.a_share_tickers.update(
                    ticker
                    for ticker in raw_tickers.split('/')
                    if any(market in ticker for market in self._A_SHARE_MARKETS)
                )
        print(f"获取到 {len(self.a_share_tickers)} 个A股股票代码")

    def calculate_factor(self, month_end, returns_data):
        """Compute factor values; subclasses must override this method.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError("子类必须实现calculate_factor方法")
#科技关联动量因子
class TechMomentum(FactorBase):
    """Technology-momentum factor.

    Each company is described by a vector counting, per IPC prefix, the
    patents it holds that were granted inside the look-back window. The
    factor value of a company is the cosine-similarity-weighted average of
    the returns of all *other* companies.
    """

    # Supported look-back labels; unknown labels fall back to one year.
    _LOOKBACK_YEARS = {'1Y': 1, '3Y': 3, '5Y': 5}

    def __init__(self, t1, t2, t4, lookback_period='1Y'):
        """Preprocess the tables and index the IPC prefixes.

        Parameters:
            t1, t2, t4: DataFrames forwarded to ``FactorBase``.
            lookback_period: '1Y', '3Y' or '5Y'; any other value is treated
                as one year.
        """
        super().__init__(t1, t2, t4)
        self.lookback_period = lookback_period
        self._setup_ipc_categories()

    def _setup_ipc_categories(self):
        """Build the sorted list of IPC prefixes and their vector positions."""
        codes = set()
        for ipc_list in self.t2['ipc_secondary']:
            if isinstance(ipc_list, list):
                codes.update(ipc_list)
        self.all_ipc_codes = sorted(codes)
        self.ipc_to_idx = {ipc: idx for idx, ipc in enumerate(self.all_ipc_codes)}
        print(f"IPC二级分类数量: {len(self.all_ipc_codes)}")

    def _get_tech_vectors(self, month_end):
        """Return ``{ticker: count vector}`` for every A-share ticker.

        A ``t2`` row contributes to a ticker when its ``patent_id`` is held by
        that ticker (``t4`` rows with ``oaid == recog_oa_id`` whose holding
        period overlaps the window) and its ``grant_date`` lies inside
        ``[month_end - lookback, month_end]``. Tickers without any such row
        get an all-zero vector.

        Tickers are matched exactly against the '/'-separated entries of the
        ``ticker`` column. Substring/regex matching is deliberately avoided:
        the '.' inside a ticker would otherwise act as a regex wildcard.
        """
        lookback_years = self._LOOKBACK_YEARS.get(self.lookback_period, 1)
        month_start = month_end - pd.DateOffset(years=lookback_years)

        # Holdings owned by the company itself and overlapping the window.
        holdings = self.t4
        end_dates = holdings['end_date']
        own_holdings = holdings[
            (holdings['oaid'] == holdings['recog_oa_id'])
            & (holdings['start_date'] <= month_end)
            & (end_dates.isna() | (end_dates >= month_start))
        ]

        # One pass over the holdings instead of one table scan per ticker.
        patent_to_tickers = defaultdict(set)
        for raw_tickers, patent_id in zip(own_holdings['ticker'], own_holdings['patent_id']):
            if not isinstance(raw_tickers, str):
                continue
            for ticker in raw_tickers.split('/'):
                if ticker in self.a_share_tickers:
                    patent_to_tickers[patent_id].add(ticker)

        # Parse the grant dates once rather than twice per ticker.
        grant_dates = pd.to_datetime(self.t2['grant_date'])
        in_window = self.t2[(grant_dates >= month_start) & (grant_dates <= month_end)]

        n_ipc = len(self.all_ipc_codes)
        tech_vectors = {ticker: np.zeros(n_ipc) for ticker in self.a_share_tickers}
        for patent_id, ipc_list in zip(in_window['patent_id'], in_window['ipc_secondary']):
            owners = patent_to_tickers.get(patent_id)
            if not owners or not isinstance(ipc_list, list):
                continue
            positions = [self.ipc_to_idx[ipc] for ipc in ipc_list if ipc in self.ipc_to_idx]
            for ticker in owners:
                vector = tech_vectors[ticker]
                for position in positions:
                    vector[position] += 1
        return tech_vectors

    def calculate_factor(self, month_end, returns_data):
        """Compute the technology-momentum factor as of ``month_end``.

        Parameters:
            month_end: Timestamp used both as the end of the patent window
                and as the key into ``returns_data``.
            returns_data: dict mapping a month-end date to a
                ``{ticker: return}`` mapping.

        Returns:
            dict ``{ticker: factor value}`` for every company with a non-zero
            technology vector; ``{}`` when the month has no returns or fewer
            than two companies have a non-zero vector. Peers whose return is
            missing or NaN are left out of the weighted average instead of
            being counted as a 0 return. A company with no usable peer gets 0.
        """
        print(f"计算 {month_end.strftime('%Y-%m')} 的科技动量因子...")

        returns = returns_data.get(month_end, {})
        if len(returns) == 0:
            # Without returns every value would be a meaningless 0; skip the
            # expensive vector construction altogether.
            print("没有找到收益率数据")
            return {}

        tech_vectors = self._get_tech_vectors(month_end)
        if not tech_vectors:
            print("没有找到技术向量数据")
            return {}

        # Drop companies without any patent in the window.
        valid_tech_vectors = {k: v for k, v in tech_vectors.items() if np.any(v > 0)}
        if len(valid_tech_vectors) < 2:
            print(f"有效公司数量不足 ({len(valid_tech_vectors)})")
            return {}

        tickers = list(valid_tech_vectors)
        tech_matrix = np.array([valid_tech_vectors[code] for code in tickers])
        similarity_matrix = np.asarray(cosine_similarity(tech_matrix), dtype=float)

        # Weights: positive similarities to *other* companies only.
        weights = np.where(similarity_matrix > 0, similarity_matrix, 0.0)
        np.fill_diagonal(weights, 0.0)

        peer_returns = np.array([returns.get(code, np.nan) for code in tickers], dtype=float)
        observed = ~np.isnan(peer_returns)
        weights = weights * observed  # zero the columns of peers without a return

        numerator = weights @ np.where(observed, peer_returns, 0.0)
        denominator = weights.sum(axis=1)
        values = np.zeros_like(denominator)
        np.divide(numerator, denominator, out=values, where=denominator > 0)

        tech_momentum = dict(zip(tickers, values.tolist()))
        print(f" -> 成功计算 {len(tech_momentum)} 只股票的因子值")
        return tech_momentum
class ReturnCalculator:
    """Thin wrapper that produces monthly stock returns."""

    def __init__(self, adjf_path, pv_path):
        """Create the loaders and the monthly return calculator.

        Parameters:
            adjf_path: path handed to ``AdjFactorLoader``.
            pv_path: path handed to ``PriceLoader``.
        """
        self.adjf_loader = AdjFactorLoader(adjf_path)
        self.pv_loader = PriceLoader(pv_path)
        self.return_calculator = Month_return(self.adjf_loader, self.pv_loader)

    def get_stock_returns(self, month_end):
        """Return the returns of the month that ends at ``month_end``.

        Returns whatever ``Month_return.Return`` produces when it is truthy,
        otherwise ``{}`` (also on any exception, which is reported on stdout).
        """
        month_start = month_end.replace(day=1)
        # Every business day of the month. The previous freq='BM' produced at
        # most ONE date (the business month end), so the "< 2" guard below
        # rejected every month.
        # NOTE(review): business days ignore exchange holidays, and the exact
        # input expected by Month_return.Return is not visible here — confirm.
        trading_days = pd.bdate_range(month_start, month_end)
        label = month_end.strftime('%Y-%m')
        if len(trading_days) < 2:
            print(f" {label}: 交易日不足")
            return {}
        try:
            returns = self.return_calculator.Return(trading_days)
            if returns:
                print(f" {label}: 获取 {len(returns)} 只股票收益率")
                return returns
            print(f" {label}: 无收益率数据")
            return {}
        except Exception as e:
            # Best effort: one failing month must not abort the whole run.
            print(f" {label}: 计算失败 - {e}")
            return {}

    def get_multiple_months_returns(self, month_ends):
        """Return ``{month_end: returns}`` for every month that has data."""
        returns_data = {}
        print("开始获取收益率数据...")
        total = len(month_ends)
        for position, month_end in enumerate(month_ends, start=1):
            print(f"[{position}/{total}] ", end="")
            returns = self.get_stock_returns(month_end)
            if returns:
                returns_data[month_end] = returns
        print(f"收益率数据获取完成,共 {len(returns_data)} 个月有效数据")
        return returns_data
class FactorProcessor:
    """Combine registered factors with monthly returns into one table."""

    def __init__(self, t1, t2, t4, adjf_path, pv_path):
        """Keep copies of the input tables and create the return calculator.

        ``None`` tables are replaced by empty DataFrames.
        """
        # Work on copies so the caller's frames are never modified.
        self.t1 = t1.copy() if t1 is not None else pd.DataFrame()
        self.t2 = t2.copy() if t2 is not None else pd.DataFrame()
        self.t4 = t4.copy() if t4 is not None else pd.DataFrame()
        self.adjf_path = adjf_path
        self.pv_path = pv_path
        self.factors = {}
        self.returns_calculator = ReturnCalculator(adjf_path, pv_path)

    def register_factor(self, factor_name, factor_class, **kwargs):
        """Instantiate ``factor_class`` and store it under ``factor_name``.

        Construction errors are reported on stdout and the factor is simply
        not registered.
        """
        try:
            factor_instance = factor_class(self.t1, self.t2, self.t4, **kwargs)
            self.factors[factor_name] = factor_instance
            print(f"成功注册因子: {factor_name}")
        except Exception as e:
            print(f"注册因子 {factor_name} 失败: {e}")

    @staticmethod
    def _previous_month_end(month_end):
        """Return the last calendar day of the month before ``month_end``."""
        return month_end - pd.offsets.MonthEnd(1)

    def process_factors(self, month_ends, factor_names=None):
        """Compute factor values and join them with the realised returns.

        For each date in ``month_ends`` the factor is evaluated as of the
        previous month end and paired with the return of the current month.

        Parameters:
            month_ends: iterable of Timestamps.
            factor_names: names of registered factors to compute; defaults to
                all registered factors.

        Returns:
            DataFrame with columns ``date``, ``ticker``, ``factor_value``,
            ``factor_name`` and ``return``; an empty DataFrame when nothing
            can be produced.
        """
        if factor_names is None:
            factor_names = list(self.factors.keys())
        if not factor_names:
            print("没有注册任何因子")
            return pd.DataFrame()

        month_ends = list(month_ends)
        previous_ends = [self._previous_month_end(month_end) for month_end in month_ends]

        # Returns are needed for the current AND the previous months; sort so
        # the months are fetched in a reproducible, chronological order.
        months_needed = sorted(set(month_ends) | set(previous_ends))
        returns_data = self.returns_calculator.get_multiple_months_returns(months_needed)
        if not returns_data:
            print("没有获取到有效的收益率数据")
            return pd.DataFrame()

        all_factor_data = []
        for factor_name in factor_names:
            if factor_name not in self.factors:
                print(f"警告: 因子 {factor_name} 未注册")
                continue
            print(f"\n开始计算因子: {factor_name}")
            factor_instance = self.factors[factor_name]
            for i, (month_end, previous_month_end) in enumerate(zip(month_ends, previous_ends)):
                print(f"[{i+1}/{len(month_ends)}] 处理 {month_end.strftime('%Y-%m')}...")
                try:
                    # The factor only sees data up to the previous month end.
                    factor_values = factor_instance.calculate_factor(previous_month_end, returns_data)
                except Exception as e:
                    print(f"计算 {factor_name} 在 {previous_month_end} 失败: {e}")
                    continue
                if not factor_values:
                    continue
                all_factor_data.extend(
                    {
                        'date': month_end,        # month in which the return is realised
                        'ticker': ticker,
                        'factor_value': value,    # computed at the previous month end
                        'factor_name': factor_name,
                    }
                    for ticker, value in factor_values.items()
                )

        if not all_factor_data:
            print("没有生成任何因子数据")
            return pd.DataFrame()
        factor_df = pd.DataFrame(all_factor_data)

        # Returns realised in the current months.
        return_data_list = [
            {'date': month_end, 'ticker': ticker, 'return': ret}
            for month_end in month_ends
            for ticker, ret in returns_data.get(month_end, {}).items()
        ]
        if not return_data_list:
            # An empty frame has no 'date'/'ticker' columns and would make
            # the merge below raise KeyError.
            print("没有与因子匹配的当月收益率数据")
            return pd.DataFrame()
        return_df = pd.DataFrame(return_data_list)

        frdata = pd.merge(factor_df, return_df, on=['date', 'ticker'], how='inner')
        print(f"\n数据处理完成,共 {len(frdata)} 条有效记录")
        print("数据格式:")
        print("- date: 收益实现的月份")
        print("- ticker: 股票代码")
        print("- factor_value: 上月末计算的因子值")
        print("- return: 当月收益率")
        return frdata
# Usage example
if __name__ == "__main__":
    # Input tables: patent details, patent change history, patent holdings.
    csv_paths = (
        '/home/liao/zhuanlidata/1-1.秩鼎_专利数据_专利详细数据_to202407.csv',
        '/home/liao/zhuanlidata/1-2.秩鼎_专利数据_专利变更历史记录_to202407.csv',
        '/home/liao/zhuanlidata/1-4.秩鼎_专利数据_专利持有情况_全部子公司_to202407.csv',
    )
    try:
        t1, t2, t4 = (pd.read_csv(csv_path) for csv_path in csv_paths)
        print("数据读取成功")
    except Exception as e:
        print(f"数据读取失败: {e}")
        # Fall back to empty frames so the rest of the script still runs.
        t1, t2, t4 = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Build the processor and register the one-year factor.
    processor = FactorProcessor(
        t1,
        t2,
        t4,
        adjf_path='/tank0/byb/data/other/adjf_rust',
        pv_path='/tank0/byb/data/mikuang/pv/',
    )
    processor.register_factor('Tech_Momentum_1Y', TechMomentum, lookback_period='1Y')

    # A short date range keeps the smoke test quick.
    test_month_ends = pd.date_range('2023-01-31', '2023-02-28', freq='BM')

    # Compute the factor/return table.
    frdata = processor.process_factors(test_month_ends)
# # 使用示例
# if __name__ == "__main__":
# # 读取数据
# t1 = pd.read_csv('/path/to/t1.csv')
# t2 = pd.read_csv('/path/to/t2.csv')
# t4 = pd.read_csv('/path/to/t4.csv')
# # 初始化因子处理器
# processor = FactorProcessor(t1, t2, t4,
# adjf_path='/tank0/byb/data/other/adjf_rust',
# pv_path='/tank0/byb/data/mikuang/pv/')
# # 注册因子
# processor.register_factor('Tech_Momentum_1Y', TechMomentum, lookback_period='1Y')
# processor.register_factor('Tech_Momentum_3Y', TechMomentum, lookback_period='3Y')
# processor.register_factor('Tech_Momentum_5Y', TechMomentum, lookback_period='5Y')
# # 生成时间序列
# month_ends = pd.date_range('2015-01-31', '2023-12-31', freq='M')
# # 处理因子数据
# frdata = processor.process_factors(month_ends)
# # 保存结果
# if not frdata.empty:
# frdata.to_csv('factor_returns_data.csv', index=False)
# print(f"数据已保存,共 {len(frdata)} 条记录")
# # 可以用于回测
# backtester = Backtrade(frdata[frdata['factor_name'] == 'Tech_Momentum_1Y'], 'Tech_Momentum_1Y')
# metrics = backtester.run_backtest()
# else:
# print("没有生成有效数据")
#
# Output after the modification:
#   数据读取成功
#   获取到 5363 个A股股票代码
#   IPC二级分类数量: 127
#   成功注册因子: Tech_Momentum_1Y
#   开始获取收益率数据...
#   [1/3] 2022-12: 交易日不足
#   [2/3] 2023-01: 交易日不足
#   [3/3] 2023-02: 交易日不足
#   收益率数据获取完成,共 0 个月有效数据
#   没有获取到有效的收益率数据