# Chapter 5: Advanced Feature Selection in Linear Models ####
# This chapter covers best subsets regression and regularization methods
# (ridge regression, the lasso, and elastic net); five techniques are
# demonstrated in all: best subsets, ridge, lasso, elastic net, and
# cross-validated regularization
install.packages("devtools")
devtools::install_github("cran/ElemStatLearn")
library(ElemStatLearn) # contains the prostate data frame used below
library(car)           # vif() for variance inflation factors
library(corrplot)      # correlation plots
library(leaps)         # best subsets regression
library(glmnet)        # regularization methods
library(caret)         # model tuning; also attaches ggplot2, used below
library(dplyr)         # data manipulation verbs
data(prostate)
View(prostate)
# Variables in the prostate data frame: lcavol, log of cancer volume; lweight,
# log of prostate weight; age, patient age; lbph, log of benign prostatic
# hyperplasia amount; svi, seminal vesicle invasion; lcp, log of capsular
# penetration; gleason, Gleason score; pgg45, percentage of Gleason scores 4
# or 5; lpsa, log of PSA (the response variable).
# train = TRUE marks training-set rows; train = FALSE marks test-set rows
plot(prostate) # scatterplot matrix of all variables
plot(prostate$gleason,ylab = "Gleason Score")
table(prostate$gleason)
boxplot(prostate$lpsa ~ prostate$gleason,xlab = "Gleason Score",ylab = "Log of PSA")
prostate$gleason <- ifelse(prostate$gleason == 6,0,1)
table(prostate$gleason)
# Data cleaning: gleason is recoded above as an indicator (6 -> 0, otherwise 1)
p.cor <- cor(prostate)
corrplot.mixed(p.cor)
# Correlation plot of the prostate data frame
# prostate_allx <- prostate[1:8]
# p.cor1 <- cor(prostate_allx)
# corrplot.mixed(p.cor1)
# The commented-out code above draws the correlation plot among the predictors
# alone, after dropping the response lpsa from prostate
train <- subset(prostate,train == TRUE)[,1:9]
str(train)
test <- subset(prostate,train == FALSE)[,1:9]
str(test)
# Extract the training and test sets
# 5.1 Best Subsets Regression ####
# Select predictors with best subsets regression, then fit a linear model
subfit <- regsubsets(lpsa ~ .,data = train)
b.sum <- summary(subfit)
b.sum
b.sum$bic
# BIC for each candidate subset
which.min(b.sum$bic)
# Number of features in the subset with the lowest BIC
# Taking summary(subfit), b.sum$bic, and which.min(b.sum$bic) together: BIC is
# minimized (about -52) with three predictors: lcavol, lweight, and gleason
plot(b.sum$bic,type = "l",xlab = "# of Features",ylab = "BIC",main = "BIC Score by Feature Inclusion")
# Plot the number of features against BIC; the low point marks the number of
# variables at which BIC is smallest
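# Optionally mark that minimum on the curve (a small extra, not required):
points(which.min(b.sum$bic), min(b.sum$bic), col = "red", pch = 19)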
plot(subfit,scale = "bic",main = "Best Subset Features")
# Visualize the best subsets results
# How to read this plot: the x-axis lists the 8 predictors by name
# Each row on the y-axis is one candidate subset; from top to bottom the rows
# here contain 3, 2, 4, 5, 6, 7, 1, and 8 predictors
# Shading encodes BIC, and lower (more negative) is better
# The lowest BIC, -52, comes from the 3-predictor subset lcavol, lweight, and
# gleason, consistent with the BIC-versus-feature-count line plot above.
# That line plot showed only that 3 predictors minimize BIC; this plot
# identifies which 3 they are.
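# The selected names can also be pulled out programmatically, a convenience
# sketch using the logical inclusion matrix that summary() returns:
best.size <- which.min(b.sum$bic)
names(which(b.sum$which[best.size, ]))[-1] # drop "(Intercept)"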
ols <- lm(lpsa ~ lcavol + lweight + gleason,data = train)
# Fit a linear model on the training set with the three selected predictors
plot(ols$fitted.values,train$lpsa,xlab = "Predicted",ylab = "Actual",main = "Predicted vs Actual")
# Plot predicted versus actual values on the training set
abline(0, 1, col = 2) # reference line: perfect prediction
pred.subfit <- predict(ols,newdata=test)
plot(pred.subfit,test$lpsa,xlab = "Predicted",ylab = "Actual",main = "Predicted vs Actual")
# Plot the best subsets model's predictions versus actual values on the test set
abline(0,1,col=2)
resid.subfit <- test$lpsa - pred.subfit
mean(resid.subfit^2)
# The mean squared error (MSE) of the residuals quantifies the test-set
# prediction error of the best subsets model; the smaller the MSE, the closer
# the predictions are to the true values
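# A small helper (a hypothetical convenience, not part of the original
# workflow) wraps this arithmetic up for reuse:
mse <- function(actual, predicted) mean((actual - predicted)^2)
mse(test$lpsa, pred.subfit) # matches the value computed above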
# The same test-set plot of predicted versus actual values, drawn with ggplot2
df <- data.frame(predicted = pred.subfit, actual = test$lpsa)
ggplot(data = df, aes(x = predicted, y = actual)) + geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Actual", x = "Predicted", y = "Actual") + theme_bw()
# 5.2 Regularization Methods: Ridge Regression, the Lasso, and Elastic Net ####
# First, ridge regression; note that ridge cannot shrink any coefficient all
# the way to zero
x <- as.matrix(train[,1:8])
y <- train[,9]
ridge <- glmnet(x,y,family = "gaussian",alpha = 0)
# alpha = 0 fits ridge regression, alpha = 1 fits the lasso, and alpha between
# 0 and 1 fits an elastic net
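# As a quick aside, a throwaway elastic-net path with an illustrative alpha of
# 0.5 (enet.demo is just a scratch name for this sketch):
enet.demo <- glmnet(x, y, family = "gaussian", alpha = 0.5)
plot(enet.demo, xvar = "lambda", label = TRUE)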
print(ridge)
plot(ridge,label = TRUE)
# Coefficient paths for ridge regression
plot(ridge,xvar = "lambda",label = TRUE)
# Coefficients plotted against log(lambda)
ridge.coef <- predict(ridge,s=0.1,type = "coefficients")
ridge.coef
# Coefficients at a specified regularization strength, lambda = 0.1
# A quick experiment: request coefficients at a series of lambda values and
# watch every coefficient shrink as lambda grows. How to choose a suitable
# lambda is covered in Section 5.3 with cv.glmnet, the regularization routine
# with built-in cross-validation
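# A minimal sketch of that experiment, one column of coefficients per lambda
# (the values 0.01, 0.1, 0.5, and 1 are arbitrary choices):
round(as.matrix(predict(ridge, s = c(0.01, 0.1, 0.5, 1), type = "coefficients")), 3)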
plot(ridge,xvar = "dev",lable = TRUE)
newx <- as.matrix(test[,1:8])
ridge.y <- predict(ridge,newx=newx,type = "response",s = 0.1)
plot(ridge.y,test$lpsa,xlab = "Predicted",ylab = "Actual",main = "Ridge Regression")
# Plot ridge predictions versus actual values on the test set
ridge.resid <- ridge.y - test$lpsa
mean(ridge.resid^2)
# MSE quantifies the ridge model's test-set prediction error
# Next the lasso; unlike ridge, the lasso can shrink coefficients exactly to zero
lasso <- glmnet(x,y,family = "gaussian",alpha = 1)
# alpha = 1 fits the lasso
print(lasso)
plot(lasso,label = TRUE)
# Coefficient paths for the lasso
plot(lasso,xvar = "lambda",label = TRUE)
# Coefficients plotted against log(lambda)
# How to read this plot: the numbers across the top count the predictors in the
# model, the x-axis is log(lambda), and the y-axis shows the coefficients
# As log(lambda) increases the coefficients shrink, eventually being compressed
# to zero; the later a variable reaches zero, the more weight it carries
# Building a model then means choosing a lambda that fixes the number of
# variables (see Section 5.3, where cv.glmnet selects lambda by cross-validation)
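# A short sketch of that idea: count the nonzero coefficients at each lambda on
# the fitted path (minus 1 for the intercept, which glmnet never penalizes):
nonzero <- colSums(as.matrix(coef(lasso)) != 0) - 1
plot(log(lasso$lambda), nonzero, type = "s",
     xlab = "log(Lambda)", ylab = "# of Nonzero Coefficients")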
lasso.coef <- predict(lasso,s=0.045,type = "coefficients")
lasso.coef
# Coefficients at lambda = 0.045
# Per the print(lasso) output, the lasso retains 7 variables at lambda = 0.045,
# so only those 7 predictors have nonzero coefficients
plot(lasso,xvar = "dev",lable = TRUE)
lasso.y <- predict(lasso,newx=newx,type = "response",s = 0.045)
plot(lasso.y,test$lpsa,xlab = "Predicted",ylab = "Actual",main = "Lasso Regression")
# Plot lasso predictions versus actual values on the test set
lasso.resid <- lasso.y - test$lpsa
mean(lasso.resid^2)
# MSE quantifies the lasso model's test-set prediction error
# Finally, an elastic net
grid <- expand.grid(.alpha = seq(0,1,by = .2),.lambda = seq(0.00,0.2,by = 0.02))
# Generate a grid of alpha and lambda combinations to tune over
table(grid)
control <- trainControl(method = "LOOCV") # leave-one-out cross-validation
set.seed(701)
enet.train <- train(lpsa ~ .,data = train,method = "glmnet",
trControl = control,tuneGrid = grid)
enet.train
# The optimal tuning parameters are alpha = 0.0 and lambda = 0.08, where RMSE
# is smallest; that is equivalent to a glmnet ridge regression with
# lambda = 0.08, and the R-squared is 61%
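# The winning pair can also be read straight off the caret object:
enet.train$bestTune # a one-row data frame with the selected alpha and lambda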
enet <- glmnet(x,y,family = "gaussian",alpha = 0.0,lambda = 0.08)
# alpha between 0 and 1 fits an elastic net; the tuned alpha of 0.0 makes this
# fit equivalent to ridge regression
enet.coef <- predict(enet,s=0.08,type = "coefficients")
enet.coef
enet.y <- predict(enet,newx=newx,type = "response",s = 0.08)
plot(enet.y,test$lpsa,xlab = "Predicted",ylab = "Actual",main = "Elastic Net")
# Plot elastic net predictions versus actual values on the test set
enet.resid <- enet.y - test$lpsa
mean(enet.resid^2)
# MSE quantifies the elastic net's test-set prediction error
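# For convenience, the four models can be compared side by side, a summary
# sketch assembled from the residual objects computed above:
data.frame(model = c("best subset", "ridge", "lasso", "elastic net"),
           test.MSE = c(mean(resid.subfit^2), mean(ridge.resid^2),
                        mean(lasso.resid^2), mean(enet.resid^2)))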
# 5.3 Cross-Validated Regularization with cv.glmnet ####
set.seed(317)
lasso.cv <- cv.glmnet(x,y,nfolds = 3)
plot(lasso.cv)
# How to read this plot: the numbers across the top count the predictors in the
# model, the x-axis is log(lambda), and the y-axis is the cross-validated MSE
# The left dashed line marks lambda.min and the right dashed line marks
# lambda.1se; which one to use depends on the results
lasso.cv$lambda.min # the lambda giving the lowest cross-validated error
lasso.cv$lambda.1se # a more conservative choice that guards against overfitting
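# The cross-validated error at each candidate can be read off the object:
lasso.cv$cvm[lasso.cv$lambda == lasso.cv$lambda.min]
lasso.cv$cvm[lasso.cv$lambda == lasso.cv$lambda.1se]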
coef(lasso.cv,s = "lambda.min")
# Matching the plot, lambda = lambda.min corresponds to the left dashed line;
# 8 predictors are retained, so all 8 have nonzero coefficients
coef(lasso.cv,s = "lambda.1se")
# Matching the plot, lambda = lambda.1se corresponds to the right dashed line;
# only 3 of the 8 predictors are retained and have nonzero coefficients
lasso.y.cv.1se <- predict(lasso.cv,newx = newx,type = "response",s = "lambda.1se")
lasso.cv.resid1 <- lasso.y.cv.1se - test$lpsa
mean(lasso.cv.resid1^2)
# Test-set MSE with lambda = lambda.1se
lasso.y.cv.min <- predict(lasso.cv,newx = newx,type = "response",s = "lambda.min")
lasso.cv.resid2 <- lasso.y.cv.min - test$lpsa
mean(lasso.cv.resid2^2)
# Test-set MSE with lambda = lambda.min
# Another cv.glmnet example, this time with the biopsy data used in Chapter 3
library(MASS)
data(biopsy,package = "MASS")
biopsy$ID <- NULL
names(biopsy) <- c("thick","u.size","u.shape","adhsn","s.size","nucl","chrom","n.nuc","mit","class")
biopsy.v2 <- na.omit(biopsy)
set.seed(123)
ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
train <- biopsy.v2[ind==1,]
test <- biopsy.v2[ind==2,]
x <- as.matrix(train[,1:9])
y <- train[,10]
set.seed(3)
fitCV <- cv.glmnet(x,y,family = "binomial",type.measure = "auc",nfolds = 5)
# A binary classification problem, hence family = "binomial"; tuning is judged by AUC
plot(fitCV)
fitCV$lambda.min
fitCV$lambda.1se
coef(fitCV,s = "lambda.1se")
# At lambda = lambda.1se, the lasso retains these predictors: thick, u.size,
# u.shape, nucl, and n.nuc
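# The retained names can also be extracted programmatically (a small sketch):
cc <- as.matrix(coef(fitCV, s = "lambda.1se"))
setdiff(rownames(cc)[cc != 0], "(Intercept)")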
install.packages("devtools")
devtools::install_github("cran/InformationValue")
library(InformationValue)
predCV <- predict(fitCV,newx = as.matrix(test[,1:9]),s = "lambda.1se",type = "response")
actuals <- ifelse(test$class == "malignant",1,0)
misClassError(actuals, predCV) # misclassification rate at the default 0.5 cutoff
plotROC(actuals,predCV)
predCV.min <- predict(fitCV,newx = as.matrix(test[,1:9]),s = "lambda.min",type = "response")
misClassError(actuals,predCV.min)
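# InformationValue can also search for the cutoff that minimizes the
# misclassification error instead of assuming 0.5:
cutoff <- optimalCutoff(actuals, predCV)
misClassError(actuals, predCV, threshold = cutoff)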
# testY <- factor(ifelse(test$class == "malignant", 1, 0))
# fit.lasso.cv <- glm(class ~ thick + u.size + u.shape + nucl + n.nuc,
#                     data = train, family = binomial)
# summary(fit.lasso.cv)
# test.lasso.probs <- predict(fit.lasso.cv, newdata = test, type = "response")
# test.lasso.probs <- ifelse(test.lasso.probs > 0.5, 1, 0)
# test.lasso.probs <- factor(test.lasso.probs, levels = levels(testY))
# caret::confusionMatrix(test.lasso.probs, testY)
# The commented-out code above refits a logistic regression with the
# lasso-selected predictors and evaluates it on the test set; testY (defined in
# its first line) holds the test labels as a factor, and confusionMatrix is
# namespaced to caret because InformationValue masks it