## ===== 仅在缺包时安装,并加载 =====
need_pkgs <- c("glmnet", "readxl", "readr")
to_install <- setdiff(need_pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
install.packages("readxl")
library(readxl)
install.packages("glmnet")
library(glmnet)
install.packages("readr")
library(readr) # 为 parse_number
# 1.读取数据(直接跳过前两行说明/单位)
dat <- read_excel("武汉数据lasso.xlsx")
# 删除第1和第3行
dat <- dat[-c(1, 2), ]
View(dat)
# 2.定义所有需要处理的分类变量(按实际列名)
categorical_cols <- c(
"gender", "age", "nationality", "marig", "edu",
"hospital", "pos", "career", "title", "appointment",
"meditime", "salary", "commutetime", "nightworkif",
"shiftwork", "workweekend"
)
# 3. 为每个变量定义标签映射
label_mappings <- list(
gender = c("1" = "Female", "2" = "Male"),
age = c("1" = "<30", "2" = ">=30,<40", "3" = ">=40,<50", "4" = ">=50"),
nationality = c("1" = "Han_nationality", "2" = "Minority"),
marig = c("1" = "Spinsterhood", "2" = "Married", "3" = "Other_Marig"),
edu = c("1" = "High_school_below", "2" = "Junior_College",
"3" = "Undergraduate", "4" = "Master", "5" = "Doctor"),
hospital = c("1" = "Red_Crossd", "2" = "Third_Hospital",
"3" = "People's_Hospital", "4" = "Fifth_Hospital"),
pos = c("1" = "Senior_manager", "2" = "Department_manager",
"3" = "Department_staff", "4" = "Intern/Trainee/student"),
career = c("1" = "Physician", "2" = "Nurse",
"3" = "Medical_Technician", "4" = "Administrative/scientific",
"5" = "Support_Crew", "6" = "Other_Career"),
title = c("1" = "Advanced", "2" = "Intermediate",
"3" = "Primary", "4" = "No_Title"),
appointment = c("1" = "Establishment", "2" = "Contract",
"3" = "Labor_Dispatching", "4" = "Personnel_Agency",
"5" = "Other_Appointment"),
meditime = c("1" = "<1year", "2" = "1~5years",
"3" = "6~10years", "4" = "11~15years", "5" = ">15years"),
salary = c("1" = "<2000", "2" = "2000~4000", "3" = "4000~6000",
"4" = "6000~8000", "5" = "8000~10000", "6" = ">10000"),
commutetime = c("1" = "<0.5h", "2" = "0.5~1h", "3" = ">1h"),
nightworkif = c("1" = "Nightwork", "2" = "Not_Nightwork"),
shiftwork = c("1" = "Shiftwork", "2" = "Not_Shiftwork"),
workweekend = c("1" = "Never", "2" = "Seldom", "3" = "Sometimes",
"4" = "Often", "5" = "Always")
)
# 4.选择特征列(X)和目标列(y) =====
x_cols <- c("gender", "age", "edu",
"pos", "career", "title",
"meditime", "salary", "commutetime", "nightworkif",
"shiftwork", "workweekend","ED","HE","RE","SS","SC")
# 构造 X(不再做任何行切片)
X <- dat[, x_cols, drop = FALSE]
# 清洗+转数值:
X[] <- lapply(X, function(v){
if (is.numeric(v)) return(v)
s <- as.character(v)
out <- readr::parse_number(s)
out[grepl("%", s)] <- out[grepl("%", s)] / 100
out
})
# 若只能用列号,就明确一个数字,二选一(确认后只保留一个)
y <- dat[["BO"]]
# 把 y 也安全转成数值(万一是字符/因子)
if (!is.numeric(y)) y <- parse_number(as.character(y))
# 5.同步清理:X 用均值填补,y 去掉非有限值,并同步子集
# 计算每列均值
col_means <- sapply(X, function(v) mean(v, na.rm = TRUE))
# 用列均值填补 X 的 NA
for (j in seq_along(X)) {
idx <- is.na(X[[j]])
if (any(idx)) X[[j]][idx] <- col_means[j]
}
# y 的非有限值行剔除(并同步到 X)
ok <- is.finite(y)
X <- X[ok, , drop = FALSE]
y <- y[ok]
# 基本检查
stopifnot(nrow(X) == length(y))
if (sd(y) == 0) stop("y 为常数列,无法做高斯回归;请换一个有变动的 y。")
if (any(!is.finite(as.matrix(X)))) stop("X 中仍有 NA/Inf/-Inf,请检查数据清洗。")
# 转矩阵/向量
x_mat <- as.matrix(X)
y_vec <- as.numeric(y)
# 6.拟合 LASSO 与 CV
set.seed(123)
fit <- glmnet(x_mat, y_vec, alpha = 1, family = "gaussian", standardize = TRUE)
cvfit <- cv.glmnet(x_mat, y_vec, alpha = 1, family = "gaussian",
nfolds = 10, standardize = TRUE)
# 7.作图
# xvar = "lambda" 时横轴是 log(lambda),下面两条竖线用 log(lambda) 是对的
plot(fit, xvar = "lambda", label = TRUE,
xlab = "log(Lambda)", ylab = "Coefficients",
main = "LASSO Coefficient Paths (log λ)",sign.lambda=1)
abline(v = log(cvfit$lambda.min), lty = 2, col = "red",)
abline(v = log(cvfit$lambda.1se), lty = 2, col = "blue")
plot(cvfit, sign.lambda=1) # CV 曲线(自带两条竖线)
# 8.提取非零系数
coef_min <- coef(cvfit, s = "lambda.min")
nz_min <- setdiff(which(coef_min != 0), 1) # 去掉截距
result_min <- data.frame(
variable = rownames(coef_min)[nz_min],
coef = as.numeric(coef_min[nz_min])
)
result_min <- result_min[order(-abs(result_min$coef)), ]
coef_1se <- coef(cvfit, s = "lambda.1se")
nz_1se <- setdiff(which(coef_1se != 0), 1)
result_1se <- data.frame(
variable = rownames(coef_1se)[nz_1se],
coef = as.numeric(coef_1se[nz_1se])
)
result_1se <- result_1se[order(-abs(result_1se$coef)), ]
cat("\n--- 选出的特征(lambda.min)---\n"); print(result_min)
cat("\n--- 选出的特征(lambda.1se)---\n"); print(result_1se)
cat("\nlambda.min =", cvfit$lambda.min, "\nlambda.1se =", cvfit$lambda.1se, "\n") 这是一份做Lasso回归分析的代码,请检查其中是否有错误,并提出改进方法。