特征选取
1、删除方差低的特征
from sklearn.feature_selection import VarianceThreshold
# Toy boolean feature matrix: 6 samples x 3 features.
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
# Threshold written as p * (1 - p) with p = 0.8 (= 0.16); features whose
# variance falls below it are dropped.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# fit() only learns the per-feature variances; nothing is applied to X yet.
sel.fit(X)
# transform() applies the selection, but the result is discarded here,
# so X itself is still the original matrix.
sel.transform(X)
# Store the reduced matrix. Rebinding X is optional; a new name works too.
X = sel.transform(X)
2、递归特征消除
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
# Generate a small synthetic regression problem for the experiment:
# 50 samples with 10 features each, seeded for repeatability.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)

# Linear-kernel support vector regressor used as the estimator inside RFECV.
estimator = SVR(kernel="linear")

# step=1 removes one feature per elimination round; cv=5 requests 5-fold
# cross-validation. fit() returns the selector itself, so construction and
# training are chained.
selector = RFECV(estimator=estimator, step=1, cv=5).fit(X, y)

# Keep only the columns the fitted selector retained.
X_SVR_NEW = selector.transform(X)
3、基于树的特征选择
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
# Load the iris data set and split it into feature matrix and labels.
iris = load_iris()
X, y = iris.data, iris.target
X.shape  # bare expression: only echoed in an interactive session
# Seed the forest so the learned importances, and therefore the columns
# kept below, are the same on every run.
clf = ExtraTreesClassifier(random_state=0)
clf = clf.fit(X, y)
clf.feature_importances_  # importance scores of the fitted forest (interactive echo only)
# prefit=True: reuse the already-fitted forest instead of fitting it again.
model = SelectFromModel(clf, prefit=True)
# Reduce X to the columns SelectFromModel keeps.
X_new = model.transform(X)
X_new.shape  # bare expression: only echoed in an interactive session