Based on the method you described, please merge it into my R code below:

# Load required libraries
library(glmnet)
library(quantreg)
library(ggplot2)
library(dplyr)
library(KernSmooth)
library(reshape2)
library(ggpubr)
# Cross-platform font setup
if (.Platform$OS.type == "windows") {
  # Windows: use SimHei
  windowsFonts(SimHei = windowsFont("SimHei"))
  default_font <- "SimHei"
} else {
  # macOS (and other systems): use Heiti TC
  default_font <- "Heiti TC"
}
# Apply the font as the default ggplot2 theme font
theme_update(text = element_text(family = default_font))
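# (Each plot below calls theme_bw(), which resets the theme, so the font family is also set explicitly inside every plot.)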
# Data preparation ---------------------------------------------------------------
data <- read.csv("/Users/lijiawei/Desktop/变量选择.csv", fileEncoding = "UTF-8")
colnames(data)[1] <- "index_return"
component_cols <- colnames(data)[-1]
# Split into training and test sets
set.seed(123)
n <- nrow(data)
train_idx <- sample(1:n, floor(0.7 * n))
test_idx <- setdiff(1:n, train_idx)
train_data <- data[train_idx, ]
test_data <- data[test_idx, ]
# Center the predictors (keep the data frame structure).
# The test set is centered with the training-set means so both sets use the same transformation.
train_means <- sapply(train_data[component_cols], mean, na.rm = TRUE)
train_data_std <- train_data %>%
  mutate(across(all_of(component_cols), ~ .x - train_means[cur_column()]))
test_data_std <- test_data %>%
  mutate(across(all_of(component_cols), ~ .x - train_means[cur_column()]))
# Keep data-frame versions (column names preserved)
X_train_df <- train_data_std[, component_cols, drop = FALSE]
X_test_df <- test_data_std[, component_cols, drop = FALSE]
y_train <- train_data$index_return
y_test <- test_data$index_return
# 1. Adaptive LASSO model ---------------------------------------------------
# Convert to matrix format
X_train_mat <- as.matrix(X_train_df)
X_test_mat <- as.matrix(X_test_df)
# Step 1: ridge regression gives initial coefficients for the adaptive weights
ridge_cv <- cv.glmnet(X_train_mat, y_train, alpha = 0)
ridge_coef <- as.vector(coef(ridge_cv, s = "lambda.min"))[-1]
adaptive_weights <- 1 / (abs(ridge_coef) + 1e-6)  # small offset avoids division by zero
# Step 2: LASSO with coefficient-specific penalty factors (adaptive LASSO)
lasso_cv <- cv.glmnet(X_train_mat, y_train, alpha = 1, penalty.factor = adaptive_weights)
lambda_opt <- lasso_cv$lambda.min
# Variable selection
lasso_coef <- as.vector(coef(lasso_cv, s = "lambda.min"))[-1]
selected_idx_lasso <- which(lasso_coef != 0)
selected_vars_lasso <- component_cols[selected_idx_lasso]
num_selected_lasso <- length(selected_vars_lasso)
# Predictions
train_pred_lasso <- predict(lasso_cv, X_train_mat, s = "lambda.min") %>% as.numeric()
test_pred_lasso <- predict(lasso_cv, X_test_mat, s = "lambda.min") %>% as.numeric()
# 2. Improved kernel-density-weighted quantile regression and variable selection ------
# Function: variable screening via stability selection (fixed version)
select_variables_stability <- function(X, y, n_boot = 50, prob = 0.7, tau = 0.5) {
  n_vars <- ncol(X)
  selection_freq <- rep(0, n_vars)
  col_names <- colnames(X)  # keep the variable names
  # Fit quantile regressions on repeated bootstrap samples
  for (i in 1:n_boot) {
    # Sample with replacement (X and y together, so indices stay aligned)
    boot_idx <- sample(1:nrow(X), replace = TRUE)
    X_boot <- X[boot_idx, , drop = FALSE]  # keep the data frame structure
    y_boot <- y[boot_idx]
    # Combine into a single data set
    df_boot <- data.frame(y_boot, X_boot)
    colnames(df_boot)[1] <- "y"  # rename the response column
    # Build the formula dynamically from all predictor columns
    formula <- as.formula(paste("y ~", paste(col_names, collapse = "+")))
    # Fit the quantile regression (data frame passed via the data argument)
    model <- rq(formula, data = df_boot, tau = tau)
    coefs <- coef(model)[-1]  # drop the intercept, keep predictor coefficients
    # Record whether each variable was selected (|coefficient| above threshold)
    selection_freq <- selection_freq + (abs(coefs) > 1e-6)
  }
  # Selection frequency across bootstrap fits
  selection_prob <- selection_freq / n_boot
  # Keep variables whose selection frequency exceeds the threshold
  selected_idx <- which(selection_prob >= prob)
  return(list(selected_idx = selected_idx, selection_prob = selection_prob))
}
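# Returns, for each predictor, the share of bootstrap refits in which its absolute coefficient exceeded
# the 1e-6 threshold, plus the indices of the variables whose share is at least `prob`.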
# Run stability selection to screen variables
set.seed(456)
stability_result <- select_variables_stability(
  X = X_train_df,
  y = y_train,
  n_boot = 100,
  prob = 0.6,
  tau = 0.5
)
selected_idx_cqr <- stability_result$selected_idx
selection_prob <- stability_result$selection_prob
# Variable importance: mean absolute coefficient across quantile regressions at several quantiles
calc_var_importance <- function(X, y, taus = c(0.25, 0.5, 0.75)) {
  var_importance <- matrix(0, nrow = ncol(X), ncol = length(taus))
  df <- data.frame(y = y, X)
  for (i in seq_along(taus)) {
    model <- rq(y ~ ., data = df, tau = taus[i])
    var_importance[, i] <- abs(coef(model)[-1])  # drop the intercept
  }
  # Average importance across the quantiles
  mean_importance <- rowMeans(var_importance)
  return(mean_importance)
}
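# Note: the predictors are mean-centered but not scaled, so these importance values also reflect each variable's raw scale.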
# Compute variable importance on the training set
var_importance <- calc_var_importance(X_train_df, y_train)
# Final selection: intersection of the stability-selected set and the high-importance set
final_selected_idx <- intersect(
  selected_idx_cqr,
  which(var_importance > quantile(var_importance, 0.75))  # keep the top 25% by importance
)
selected_vars_cqr <- component_cols[final_selected_idx]
num_selected_cqr <- length(selected_vars_cqr)
# Build the model formula with the selected variables only
if (num_selected_cqr > 0) {
  formula_str_cqr <- paste("y_train ~", paste(selected_vars_cqr, collapse = "+"))
  model_formula_cqr <- as.formula(formula_str_cqr)
} else {
  # Fall back to all variables if nothing was selected
  formula_str_cqr <- paste("y_train ~", paste(component_cols, collapse = "+"))
  model_formula_cqr <- as.formula(formula_str_cqr)
  final_selected_idx <- 1:length(component_cols)
  selected_vars_cqr <- component_cols
  num_selected_cqr <- length(selected_vars_cqr)
}
# Quantile levels for the composite model
tau_seq <- c(0.25, 0.5, 0.75)
# Fit the base quantile regressions (selected variables only)
base_models <- lapply(tau_seq, function(tau) {
  rq(model_formula_cqr, data = cbind(y_train, X_train_df[, final_selected_idx, drop = FALSE]), tau = tau)
})
# Kernel-density weights based on the average residual across the quantile fits
resid_mat <- sapply(base_models, residuals)
avg_resid <- rowMeans(resid_mat)
if(length(unique(avg_resid)) == 1) {
weights <- rep(1/length(avg_resid), length(avg_resid))
} else {
kde <- bkde(avg_resid)
dens_fun <- approxfun(kde$x, kde$y, rule = 2)
weights <- dens_fun(avg_resid)
weights <- weights / sum(weights)
}
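# Each observation's weight is the estimated density of its average residual (normalized to sum to 1),
# so observations whose residuals lie in high-density regions carry more influence in the weighted refit.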
# Refit each quantile regression with the density-based observation weights
weighted_models <- lapply(tau_seq, function(tau) {
  rq(model_formula_cqr,
     data = cbind(y_train, X_train_df[, final_selected_idx, drop = FALSE]),
     tau = tau, weights = weights)
})
# Make sure the test data is in the expected format
test_data_df_cqr <- data.frame(X_test_df[, final_selected_idx, drop = FALSE])
colnames(test_data_df_cqr) <- selected_vars_cqr
# Generate test-set predictions
test_pred_cqr <- rowMeans(sapply(weighted_models, function(m) {
  predict(m, newdata = test_data_df_cqr)
}))
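# The CQR forecast is the equal-weight average of the predictions from all weighted quantile models in tau_seq.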
# 3. Results ------------------------------------------------------------
# Dimension checks
stopifnot(length(y_test) == length(test_pred_lasso))
stopifnot(length(y_test) == length(test_pred_cqr))
# Compute error metrics (mean squared error)
metrics <- data.frame(
  方法 = c("自适应LASSO", "改进核密度CQR"),
  内预测误差 = c(
    mean((y_train - train_pred_lasso)^2),
    mean((y_train - predict(weighted_models[[2]]))^2)  # median (tau = 0.5) model as the in-sample representative
  ),
  外预测误差 = c(
    mean((y_test - test_pred_lasso)^2),
    mean((y_test - test_pred_cqr)^2)
  )
)
# Number of variables selected by each method
selection_counts <- data.frame(
  方法 = c("自适应LASSO", "改进核密度CQR"),
  筛选数量 = c(num_selected_lasso, num_selected_cqr)
)
# Selected constituent stocks by method
selected_stocks <- data.frame(
  方法 = c("自适应LASSO", "改进核密度CQR"),
  入选成分股 = c(
    paste(selected_vars_lasso, collapse = ", "),
    if (num_selected_cqr > 0) paste(selected_vars_cqr, collapse = ", ") else "无显著变量"
  )
)
# 4. Output --------------------------------------------------------------
cat("=== 表格1: 预测误差比较 ===\n")
print(metrics)
cat("\n=== 表格2: 变量选择数量 ===\n")
print(selection_counts)
cat("\n=== 表格3: 入选成分股 ===\n")
print(selected_stocks)
# 5. Visualization --------------------------------------------------------
# Prediction comparison plot
plot_df <- data.frame(
  Date = seq_along(y_test),
  Actual = y_test,
  LASSO = test_pred_lasso,
  CQR = test_pred_cqr
)
p1 <- ggplot(plot_df, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "实际值"), linewidth = 1) +
  geom_line(aes(y = LASSO, color = "LASSO预测"), linetype = "dashed") +
  geom_line(aes(y = CQR, color = "CQR预测"), linetype = "dotted") +
  scale_color_manual(
    name = "图例",
    values = c("实际值" = "black", "LASSO预测" = "blue", "CQR预测" = "red")
  ) +
  labs(title = "指数追踪效果对比", x = "样本序号", y = "收益率") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(family = default_font),
    plot.title = element_text(hjust = 0.5)
  )
# Variable importance plot (CQR-selected variables only)
importance_df <- data.frame(
  变量 = selected_vars_cqr,
  重要性 = var_importance[final_selected_idx]
)
# Sort by importance
importance_df <- importance_df[order(-importance_df$重要性), ]
p2 <- ggplot(importance_df, aes(x = reorder(变量, -重要性), y = 重要性)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "改进核密度CQR变量重要性", x = "变量", y = "重要性") +
  theme_bw() +
  theme(
    text = element_text(family = default_font),
    plot.title = element_text(hjust = 0.5)
  )
# Stability-selection results plot
stability_df <- data.frame(
  变量 = component_cols,
  选择概率 = selection_prob
)
# Show only variables with a positive selection probability, sorted by probability
stability_df <- stability_df[stability_df$选择概率 > 0, ]
stability_df <- stability_df[order(-stability_df$选择概率), ]
p3 <- ggplot(stability_df, aes(x = reorder(变量, -选择概率), y = 选择概率)) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  coord_flip() +
  labs(title = "变量稳定性选择概率", x = "变量", y = "选择概率") +
  theme_bw() +
  theme(
    text = element_text(family = default_font),
    plot.title = element_text(hjust = 0.5)
  )
# Arrange all charts in one figure
gridExtra::grid.arrange(p1, p2, p3, ncol = 1, heights = c(2, 1.5, 1.5))
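# Note: gridExtra is called via :: and must be installed; ggpubr (loaded above) provides ggarrange() as an alternative for combining the plots.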