# 题目重述
基于 `data_HeartDisease.xlsx` 数据构建分类模型,要求:
1. 尝试选择不同的特征(自变量);
2. 使用多种机器学习算法(如决策树、随机森林、逻辑回归、KNN等)建模;
3. 对模型进行优化(如超参数调优、阈值调整等);
4. 使用最优模型对 `student_HD.xlsx` 数据进行预测;
5. 分析并确定具有最高预测准确率的算法与特征组合;
6. 提交完整 R 代码和预测结果的 Excel 文件。
---
# 详解
以下为完整可运行的 **R语言解决方案**,适用于课程作业提交。代码实现了从数据加载、预处理、多特征集与多算法对比、交叉验证评估、最优模型训练到对 `student_HD.xlsx` 进行预测并输出结果文件的全流程。
```r
# —————————————————————— 1. 加载必要包 ——————————————————————
library(tidyverse)
library(mlr3verse)
library(readxl)
library(writexl)
# 注意:若未安装,请先运行:
# install.packages(c("tidyverse", "mlr3verse", "readxl", "writexl"))
# ---- 2. Load and preprocess the training data ----
# Read the raw training sheet from the working directory.
data_train <- read_excel("data_HeartDisease.xlsx")
# Turn every character column into a factor; columns of any other type are
# passed through unchanged.
data_clean <- mutate(
  data_train,
  across(where(is.character), as.factor)
)
# Wrap the cleaned data in an mlr3 classification task whose target column is
# HeartDisease.
task_full <- as_task_classif(data_clean, target = "HeartDisease")
# ---- 3. Build alternative feature subsets for comparison ----
# Scheme A: core medical features (mostly categorical).
features_medical <- c(
  "AgeCategory", "Sex", "Smoking", "Diabetic", "DiffWalking",
  "Stroke", "KidneyDisease", "GenHealth", "PhysicalActivity"
)
# Scheme B: scheme A plus the numeric indicators.
features_enhanced <- c(
  features_medical, "BMI", "PhysicalHealth", "MentalHealth", "SleepTime"
)

# Fail early, with every offending name listed, if the training sheet lacks a
# column that one of the schemes expects.
missing_features <- setdiff(features_enhanced, task_full$feature_names)
if (length(missing_features) > 0) {
  stop(
    "训练数据缺少必要特征:",
    paste(missing_features, collapse = ", "),
    call. = FALSE
  )
}

# Task$select() modifies the task it is called on (R6 reference semantics).
# Calling it on task_full directly would shrink task_full itself, so a second
# select() asking for the numeric columns would fail. Each subset is therefore
# taken from its own clone and task_full keeps all features.
task_medical <- task_full$clone(deep = TRUE)
task_medical$select(features_medical)

task_enhanced <- task_full$clone(deep = TRUE)
task_enhanced$select(features_enhanced)

# Give each task an id equal to its name in `tasks`. Without this all three
# share the id derived from the data object, so benchmark rows could not be
# told apart and a later lookup of `tasks` by `task_id` would find nothing.
task_full$id <- "full"
task_medical$id <- "medical"
task_enhanced$id <- "enhanced"

# All candidate tasks, keyed by their task id.
tasks <- list(
  full = task_full,
  medical = task_medical,
  enhanced = task_enhanced
)
# ---- 4. Define the candidate learners ----
# Every learner is configured to predict class probabilities.
learner_tree <- lrn("classif.rpart", predict_type = "prob")
# Random forest, restricted to a single thread.
learner_forest <- lrn(
  "classif.ranger",
  predict_type = "prob",
  num.threads = 1
)
learner_logreg <- lrn("classif.log_reg", predict_type = "prob")
learner_knn <- lrn("classif.kknn", predict_type = "prob")

# Order: decision tree, random forest, logistic regression, k-nearest
# neighbours.
learners <- list(
  learner_tree,
  learner_forest,
  learner_logreg,
  learner_knn
)
# ---- 5. Run a 5-fold cross-validated benchmark ----
# Fix the RNG seed first: fold assignment and the random forest are
# stochastic, so without a seed the ranking (and therefore the "best" model)
# can change from one run to the next.
set.seed(2024)
# Cross every task with every learner under 5-fold cross-validation.
design <- benchmark_grid(
  tasks = tasks,
  learners = learners,
  resamplings = rsmp("cv", folds = 5)
)
# Run the benchmark. This is the slow step; consider running it once and
# saving the result.
bmr <- benchmark(design)
# ---- 6. Aggregate performance and pick the best configuration ----
# Score every task/learner combination on accuracy and AUC.
eval_measures <- msrs(c("classif.acc", "classif.auc"))
performance <- bmr$aggregate(eval_measures)
# Put the highest accuracy first.
performance <- performance[order(-classif.acc), ]
print(performance)
# The first row after sorting is the winning configuration.
best_model <- performance[1, ]
best_acc_rounded <- round(best_model$classif.acc, 4)
best_auc_rounded <- round(best_model$classif.auc, 4)
# Report the winner on the console.
cat("\n🏆 最佳模型配置:\n")
cat("特征集:", best_model$task_id, "\n")
cat("算法:", best_model$learner_id, "\n")
cat("准确率:", best_acc_rounded, "\n")
cat("AUC:", best_auc_rounded, "\n\n")
# ---- 7. Retrain the winning configuration on all training data ----
# Take the task from the winning resample result rather than indexing `tasks`
# by `task_id`. A task id only equals its name in `tasks` when it was set that
# way explicitly; otherwise the name lookup yields NULL and training fails.
best_rr <- best_model$resample_result[[1L]]
best_task <- best_rr$task$clone(deep = TRUE)
# Fresh, untrained learner of the winning type, predicting probabilities.
best_learner <- lrn(best_model$learner_id, predict_type = "prob")
# Tuning and some learners are stochastic; seed so the final model is
# repeatable.
set.seed(2024)
# Only the decision tree is tuned here: if it won, wrap it in an auto-tuner
# that random-searches `cp` and `minsplit` for 20 evaluations, scoring each
# candidate by 5-fold cross-validated accuracy.
if (grepl("rpart", best_model$learner_id, fixed = TRUE)) {
  library(mlr3tuning)
  at <- auto_tuner(
    tuner = tnr("random_search"),
    learner = best_learner,
    resampling = rsmp("cv", folds = 5),
    measure = msr("classif.acc"),
    search_space = ps(cp = p_dbl(0, 0.1), minsplit = p_int(2, 20)),
    term_evals = 20
  )
  best_learner <- at
}
# Fit the final model on the full training task.
best_learner$train(best_task)
# ---- 8. Read the student data and predict ----
# Read the data to be scored.
data_student <- read_excel("student_HD.xlsx")
# The model needs exactly the features of the task it was trained on.
required_features <- best_task$feature_names

# Report every missing column at once instead of stopping at the first one.
missing_cols <- setdiff(required_features, names(data_student))
if (length(missing_cols) > 0) {
  stop("缺少必要特征:", paste(missing_cols, collapse = ", "))
}

# Coerce each feature to the type it has in the training task. Forcing every
# column to a factor would break numeric features such as BMI, and factors
# built from the new data alone could carry different levels than the
# training data.
train_types <- best_task$feature_types
train_levels <- best_task$levels()
X_new <- data_student[, required_features]
for (col in required_features) {
  raw_values <- X_new[[col]]
  col_type <- train_types$type[train_types$id == col]
  converted <- switch(
    col_type,
    factor = factor(as.character(raw_values), levels = train_levels[[col]]),
    ordered = factor(
      as.character(raw_values),
      levels = train_levels[[col]],
      ordered = TRUE
    ),
    numeric = as.numeric(raw_values),
    integer = as.integer(raw_values),
    logical = as.logical(raw_values),
    character = as.character(raw_values),
    raw_values
  )
  # Values that became NA only through coercion (e.g. a category never seen
  # in training) are surfaced instead of being dropped silently.
  n_lost <- sum(is.na(converted) & !is.na(raw_values))
  if (n_lost > 0) {
    warning(
      "特征 ", col, " 中有 ", n_lost, " 个取值无法转换为训练数据的类型/水平,已置为 NA",
      call. = FALSE
    )
  }
  X_new[[col]] <- converted
}

# Score the new rows with the final model.
pred <- best_learner$predict_newdata(X_new)
# Append the predicted class and both class probabilities to the student
# sheet; the sheet's own columns are written back unmodified.
result_df <- bind_cols(
  data_student,
  tibble(
    prediction = pred$response,
    prob_No = pred$prob[, "No"],
    prob_Yes = pred$prob[, "Yes"]
  )
)
# ---- 9. Export the predictions to Excel ----
output_path <- "predicted_student_HD.xlsx"
write_xlsx(result_df, output_path)
# Console summary of what was written and which model produced it.
cat(
  "✅ 预测完成!\n",
  "📁 文件已保存:predicted_student_HD.xlsx\n",
  sep = ""
)
n_rows_written <- nrow(result_df)
n_features_used <- length(required_features)
cat("📊 总样本数:", n_rows_written, "\n")
cat("🔍 使用特征数:", n_features_used, "\n")
winning_accuracy <- round(best_model$classif.acc, 4)
cat("🎯 最高准确率模型:", best_model$learner_id, "(", winning_accuracy, ")\n")
```

# 题目重述
基于 `data_HeartDisease.xlsx` 数据构建分类模型,要求:
1. 尝试选择不同的特征(自变量);
2. 使用多种机器学习算法(如决策树、随机森林、逻辑回归、KNN等)建模;
3. 对模型进行优化(如超参数调优、阈值调整等);
4. 使用最优模型对 `student_HD.xlsx` 数据进行预测;
5. 分析并确定具有最高预测准确率的算法与特征组合;
6. 提交完整 R 代码和预测结果的 Excel 文件。
---
# 详解
以下为完整可运行的 **R语言解决方案**,适用于课程作业提交。代码实现了从数据加载、预处理、多特征集与多算法对比、交叉验证评估、最优模型训练到对 `student_HD.xlsx` 进行预测并输出结果文件的全流程。
```r
# —————————————————————— 1. 加载必要包 ——————————————————————
library(tidyverse)
library(mlr3verse)
library(readxl)
library(writexl)
# 注意:若未安装,请先运行:
# install.packages(c("tidyverse", "mlr3verse", "readxl", "writexl"))
# ---- 2. Load and preprocess the training data ----
# Read the raw training sheet from the working directory.
data_train <- read_excel("data_HeartDisease.xlsx")
# Turn every character column into a factor; columns of any other type are
# passed through unchanged.
data_clean <- mutate(
  data_train,
  across(where(is.character), as.factor)
)
# Wrap the cleaned data in an mlr3 classification task whose target column is
# HeartDisease.
task_full <- as_task_classif(data_clean, target = "HeartDisease")
# ---- 3. Build alternative feature subsets for comparison ----
# Scheme A: core medical features (mostly categorical).
features_medical <- c(
  "AgeCategory", "Sex", "Smoking", "Diabetic", "DiffWalking",
  "Stroke", "KidneyDisease", "GenHealth", "PhysicalActivity"
)
# Scheme B: scheme A plus the numeric indicators.
features_enhanced <- c(
  features_medical, "BMI", "PhysicalHealth", "MentalHealth", "SleepTime"
)

# Fail early, with every offending name listed, if the training sheet lacks a
# column that one of the schemes expects.
missing_features <- setdiff(features_enhanced, task_full$feature_names)
if (length(missing_features) > 0) {
  stop(
    "训练数据缺少必要特征:",
    paste(missing_features, collapse = ", "),
    call. = FALSE
  )
}

# Task$select() modifies the task it is called on (R6 reference semantics).
# Calling it on task_full directly would shrink task_full itself, so a second
# select() asking for the numeric columns would fail. Each subset is therefore
# taken from its own clone and task_full keeps all features.
task_medical <- task_full$clone(deep = TRUE)
task_medical$select(features_medical)

task_enhanced <- task_full$clone(deep = TRUE)
task_enhanced$select(features_enhanced)

# Give each task an id equal to its name in `tasks`. Without this all three
# share the id derived from the data object, so benchmark rows could not be
# told apart and a later lookup of `tasks` by `task_id` would find nothing.
task_full$id <- "full"
task_medical$id <- "medical"
task_enhanced$id <- "enhanced"

# All candidate tasks, keyed by their task id.
tasks <- list(
  full = task_full,
  medical = task_medical,
  enhanced = task_enhanced
)
# ---- 4. Define the candidate learners ----
# Every learner is configured to predict class probabilities.
learner_tree <- lrn("classif.rpart", predict_type = "prob")
# Random forest, restricted to a single thread.
learner_forest <- lrn(
  "classif.ranger",
  predict_type = "prob",
  num.threads = 1
)
learner_logreg <- lrn("classif.log_reg", predict_type = "prob")
learner_knn <- lrn("classif.kknn", predict_type = "prob")

# Order: decision tree, random forest, logistic regression, k-nearest
# neighbours.
learners <- list(
  learner_tree,
  learner_forest,
  learner_logreg,
  learner_knn
)
# ---- 5. Run a 5-fold cross-validated benchmark ----
# Fix the RNG seed first: fold assignment and the random forest are
# stochastic, so without a seed the ranking (and therefore the "best" model)
# can change from one run to the next.
set.seed(2024)
# Cross every task with every learner under 5-fold cross-validation.
design <- benchmark_grid(
  tasks = tasks,
  learners = learners,
  resamplings = rsmp("cv", folds = 5)
)
# Run the benchmark. This is the slow step; consider running it once and
# saving the result.
bmr <- benchmark(design)
# ---- 6. Aggregate performance and pick the best configuration ----
# Score every task/learner combination on accuracy and AUC.
eval_measures <- msrs(c("classif.acc", "classif.auc"))
performance <- bmr$aggregate(eval_measures)
# Put the highest accuracy first.
performance <- performance[order(-classif.acc), ]
print(performance)
# The first row after sorting is the winning configuration.
best_model <- performance[1, ]
best_acc_rounded <- round(best_model$classif.acc, 4)
best_auc_rounded <- round(best_model$classif.auc, 4)
# Report the winner on the console.
cat("\n🏆 最佳模型配置:\n")
cat("特征集:", best_model$task_id, "\n")
cat("算法:", best_model$learner_id, "\n")
cat("准确率:", best_acc_rounded, "\n")
cat("AUC:", best_auc_rounded, "\n\n")
# ---- 7. Retrain the winning configuration on all training data ----
# Take the task from the winning resample result rather than indexing `tasks`
# by `task_id`. A task id only equals its name in `tasks` when it was set that
# way explicitly; otherwise the name lookup yields NULL and training fails.
best_rr <- best_model$resample_result[[1L]]
best_task <- best_rr$task$clone(deep = TRUE)
# Fresh, untrained learner of the winning type, predicting probabilities.
best_learner <- lrn(best_model$learner_id, predict_type = "prob")
# Tuning and some learners are stochastic; seed so the final model is
# repeatable.
set.seed(2024)
# Only the decision tree is tuned here: if it won, wrap it in an auto-tuner
# that random-searches `cp` and `minsplit` for 20 evaluations, scoring each
# candidate by 5-fold cross-validated accuracy.
if (grepl("rpart", best_model$learner_id, fixed = TRUE)) {
  library(mlr3tuning)
  at <- auto_tuner(
    tuner = tnr("random_search"),
    learner = best_learner,
    resampling = rsmp("cv", folds = 5),
    measure = msr("classif.acc"),
    search_space = ps(cp = p_dbl(0, 0.1), minsplit = p_int(2, 20)),
    term_evals = 20
  )
  best_learner <- at
}
# Fit the final model on the full training task.
best_learner$train(best_task)
# ---- 8. Read the student data and predict ----
# Read the data to be scored.
data_student <- read_excel("student_HD.xlsx")
# The model needs exactly the features of the task it was trained on.
required_features <- best_task$feature_names

# Report every missing column at once instead of stopping at the first one.
missing_cols <- setdiff(required_features, names(data_student))
if (length(missing_cols) > 0) {
  stop("缺少必要特征:", paste(missing_cols, collapse = ", "))
}

# Coerce each feature to the type it has in the training task. Forcing every
# column to a factor would break numeric features such as BMI, and factors
# built from the new data alone could carry different levels than the
# training data.
train_types <- best_task$feature_types
train_levels <- best_task$levels()
X_new <- data_student[, required_features]
for (col in required_features) {
  raw_values <- X_new[[col]]
  col_type <- train_types$type[train_types$id == col]
  converted <- switch(
    col_type,
    factor = factor(as.character(raw_values), levels = train_levels[[col]]),
    ordered = factor(
      as.character(raw_values),
      levels = train_levels[[col]],
      ordered = TRUE
    ),
    numeric = as.numeric(raw_values),
    integer = as.integer(raw_values),
    logical = as.logical(raw_values),
    character = as.character(raw_values),
    raw_values
  )
  # Values that became NA only through coercion (e.g. a category never seen
  # in training) are surfaced instead of being dropped silently.
  n_lost <- sum(is.na(converted) & !is.na(raw_values))
  if (n_lost > 0) {
    warning(
      "特征 ", col, " 中有 ", n_lost, " 个取值无法转换为训练数据的类型/水平,已置为 NA",
      call. = FALSE
    )
  }
  X_new[[col]] <- converted
}

# Score the new rows with the final model.
pred <- best_learner$predict_newdata(X_new)
# Append the predicted class and both class probabilities to the student
# sheet; the sheet's own columns are written back unmodified.
result_df <- bind_cols(
  data_student,
  tibble(
    prediction = pred$response,
    prob_No = pred$prob[, "No"],
    prob_Yes = pred$prob[, "Yes"]
  )
)
# ---- 9. Export the predictions to Excel ----
output_path <- "predicted_student_HD.xlsx"
write_xlsx(result_df, output_path)
# Console summary of what was written and which model produced it.
cat(
  "✅ 预测完成!\n",
  "📁 文件已保存:predicted_student_HD.xlsx\n",
  sep = ""
)
n_rows_written <- nrow(result_df)
n_features_used <- length(required_features)
cat("📊 总样本数:", n_rows_written, "\n")
cat("🔍 使用特征数:", n_features_used, "\n")
winning_accuracy <- round(best_model$classif.acc, 4)
cat("🎯 最高准确率模型:", best_model$learner_id, "(", winning_accuracy, ")\n")
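```

原始代码(直接在 `task_full` 上连续调用 `$select()`)运行时报错如下。原因是 `Task$select()` 会就地修改任务对象:第一次执行 `task_full$select(features_medical)` 之后,`task_full` 只剩下 9 个类别特征,第二次执行 `task_full$select(features_enhanced)` 时便找不到 `BMI`、`PhysicalHealth`、`MentalHealth`、`SleepTime`。解决办法是先 `task_full$clone()`,再在副本上调用 `$select()`。

```
错误于.__Task__select(self = self, private = private, super = super, :
Assertion on 'cols' failed: Must be a subset of {'AgeCategory','Diabetic','DiffWalking','GenHealth','KidneyDisease','PhysicalActivity','Sex','Smoking','Stroke'}, but has additional elements {'BMI','PhysicalHealth','MentalHealth','SleepTime'}.
```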
最新发布