# Use the SelectFromModel meta-transformer together with LassoCV to select the two best features from the Boston dataset.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
# Load the Boston house-price data (a regression dataset commonly used for
# price prediction: 506 samples, 13 features).
# NOTE(review): load_boston appears to have been removed from recent
# scikit-learn releases -- confirm the installed version still provides it.
boston = load_boston()
# The loaded object is indexed by key: 'data' is the feature matrix and
# 'target' is the value to regress on.
X = boston['data']
y = boston['target']
# Base estimator: LassoCV, chosen because the L1 penalty drives many
# coefficients to zero and therefore promotes a sparse set of features.
#
# Background on sklearn.linear_model: it provides linear models such as ridge
# regression, Bayesian regression, LARS-based Lasso and Elastic Net, as well
# as stochastic-gradient-descent based estimators.
#
# LassoCV picks alpha by cross-validation and minimises, via coordinate
# descent:
#     (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
#
# Parameters worth knowing:
#   n_alphas   - number of alpha values tried along the regularisation path
#   alphas     - explicit list of alpha values to try
#   normalize  - when true, regressors are centred on their mean and scaled by
#                their L2 norm before fitting (NOTE(review): not available in
#                newer scikit-learn releases -- check the installed version)
#   precompute - whether to precompute the Gram matrix to speed up fitting
#   max_iter   - maximum number of iterations
#   tol        - tolerance used as the optimisation stopping criterion
#   cv         - cross-validation strategy (5 folds here)
# Fitted attributes:
#   coef_      - coefficient vector, one weight per feature
#   alphas_    - the grid of alpha values that was evaluated
#   n_iter_    - coordinate-descent iterations needed to reach the tolerance
clf = LassoCV(cv=5)

# Meta-transformer that keeps features according to their importance weights.
#   estimator    - the base estimator (already fitted or not); once fitted it
#                  must expose either feature_importances_ or coef_
#   threshold    - features whose importance is greater than or equal to this
#                  value are kept and the rest are dropped; may be a string,
#                  a float or None
#   max_features - upper bound on the number of features to select
# Start with a minimum threshold of 0.25.
sfm = SelectFromModel(estimator=clf, threshold=0.25)

# Fit the selector, then transform X down to the selected columns and record
# how many features survived the initial threshold.
n_features = sfm.fit(X, y).transform(X).shape[1]
# Raise the threshold until at most two features remain.
# The threshold attribute can be updated directly on the fitted selector, so
# the meta-transformer does not have to be refitted on every iteration.
#
# Compute the reduced matrix once before looping so that X_transform is
# defined even when the initial threshold already leaves two or fewer
# features and the loop body never executes.
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]
while n_features > 2:
    # Tighten the cut-off a little and re-select with the stricter value.
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]
# Plot the two selected features from X against each other.
# NOTE(review): this assumes X_transform has at least two columns; indexing
# column 1 fails otherwise.
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]

# Draw the selected feature pair as red dots.
plt.plot(feature1, feature2, 'r.')
plt.title(
    'features selected from boston using SelectFromModel with threshold %0.3f.'
    % sfm.threshold
)
plt.xlabel('feature number 1')
plt.ylabel('feature number 2')
# Clamp the y-axis to the observed range of the second feature.
plt.ylim([feature2.min(), feature2.max()])
plt.show()

# Report the shape of the reduced matrix and its first ten rows.
print(X_transform.shape)
print(X_transform[:10])