1. ANOVA-Based Feature Selection
1.1 Principle & Manual Implementation
import pandas as pd
import numpy as np
X_train_OE = pd.DataFrame({'tenure': [1, 3, 2, 4, 2, 3, 4]
,'Churn': [0,0,1,0,0,1,1]
,'MonthlyCharges': [29.85,56.95,53.85,42.30,70.70,73.12,76.37]
,'gender': ['F','M','M','F','M','M','F']})
X_train = X_train_OE['MonthlyCharges']
y_train = X_train_OE['Churn']
'''
ANOVA hypotheses:
H0: the churned and non-churned user groups have the same mean monthly charges
H1: the monthly charges of the two groups differ significantly
'''
cat_0 = X_train[y_train == 0]
cat_1 = X_train[y_train == 1]
cat_0, cat_1
'''
(0 29.85
1 56.95
3 42.30
4 70.70
Name: MonthlyCharges, dtype: float64,
2 53.85
5 73.12
6 76.37
Name: MonthlyCharges, dtype: float64)
'''
'''
SST: total sum of squares, the overall deviation of the sample
SSE_j: within-group sum of squares for group j
SSE = sum of all SSE_j, measures within-group dispersion
SSB = SST - SSE, the between-group sum of squares, measures between-group dispersion
F: the test statistic, which follows an F(k-1, n-k) distribution under H0
Plug in the data to compute F, look up its tail probability under F(k-1, n-k),
and decide whether the null hypothesis can be rejected.
'''
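'''
In formula form (k groups, n samples in total; group j has n_j samples with mean x̄_j; grand mean x̄):
SST = Σ_i (x_i − x̄)²
SSE = Σ_j Σ_{i ∈ group j} (x_ij − x̄_j)²
SSB = Σ_j n_j (x̄_j − x̄)²,  with SST = SSE + SSB
F = (SSB / (k−1)) / (SSE / (n−k)) ~ F(k−1, n−k) under H0
'''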
k = y_train.nunique()
n = len(y_train)
k, n
'''
(2, 7)
'''
cat_0_mean = cat_0.mean()
cat_1_mean = cat_1.mean()
SSE0 = np.power(cat_0 - cat_0_mean, 2).sum()
SSE1 = np.power(cat_1 - cat_1_mean, 2).sum()
SSE = SSE0 + SSE1
SSE
'''
1238.4436000000003
'''
n0 = len(cat_0)
n1 = len(cat_1)
cat_mean = X_train.mean()
SSB = n0 * np.power(cat_0_mean-cat_mean, 2) + n1 * np.power(cat_1_mean-cat_mean, 2)
SSB
'''
544.9866857142856
'''
SST = np.power(X_train - cat_mean, 2).sum()
SSE + SSB , SST
'''
(1783.430285714286, 1783.4302857142861)
'''
MSB = SSB / (k - 1)
MSE = SSE / (n - k)
F_score = MSB / MSE
F_score
'''
2.2002886756986166
'''
'''
Either look up a critical-value table at the 0.01 significance level (p falls below
the significance level once F exceeds the critical value), or compute the p-value
directly with scipy.special.fdtrc (the F-distribution survival function).
'''
import scipy.special
scipy.special.fdtrc(k-1, n-k, F_score)
'''
The probability of observing F >= 2.2 under H0 is 0.198, so on this toy sample we
cannot reject H0 at the 0.01 (or even 0.05) level.
Had the p-value been near zero, we would reject H0 and accept H1: the two groups
differ significantly in monthly charges, i.e. the Churn label is significantly
associated with MonthlyCharges, which would make MonthlyCharges a feature worth
feeding into the model to learn label-specific patterns.
'''
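'''
Cross-check: the F-distribution survival function in scipy.stats should give the
same tail probability as scipy.special.fdtrc.
'''
import scipy.stats
scipy.stats.f.sf(F_score, k - 1, n - k)   # ≈ 0.198, matching fdtrc above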
1.2 Implementation with scipy.stats.f_oneway(d1, d2)
import scipy.stats
'''
statistic: the F value
pvalue: the probability of observing an F at least this large under H0
The results match the manual implementation above.
'''
scipy.stats.f_oneway(cat_0, cat_1)
'''
F_onewayResult(statistic=2.200288675698618, pvalue=0.19809649741527557)
'''
a1 = np.random.randn(1000)
a2 = np.random.randn(1000)
scipy.stats.f_oneway(a1, a2)
'''
F_onewayResult(statistic=0.27750725017678485, pvalue=0.5983982050004926)
'''
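'''
Sanity check in the other direction (illustrative; exact numbers vary with the random seed):
'''
a3 = np.random.randn(1000) + 0.5   # second group with a shifted mean
scipy.stats.f_oneway(a1, a3)       # expect a large F and a p-value near 0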
1.3 Implementation with sklearn.feature_selection.f_classif(X, y)
from sklearn.feature_selection import f_classif
'''
X_train.shape = (7,)
X_train.values.__class__ >>> numpy.ndarray
y_train.shape = (7,)
y_train.__class__ >>> pandas.core.series.Series
f_classif expects a 2-D feature matrix, hence the reshape below; internally it calls f_oneway.
'''
f_classif(X_train.values.reshape(-1, 1), y_train)
'''
(array([2.20028868]), array([0.1980965]))
'''
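'''
f_classif scores every column independently, so a multi-column matrix also works in
one call; this should reproduce the per-feature scores that SelectKBest reports in
the next section.
'''
f_classif(X_train_OE[['tenure', 'MonthlyCharges']], y_train)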
1.4 Feature selection with sklearn.feature_selection.SelectKBest
X_train2 = pd.DataFrame(X_train_OE, columns=['tenure','MonthlyCharges'])
y_train = X_train_OE['Churn']
from sklearn.feature_selection import SelectKBest
'''
score_func: scoring function used for selection, defaults to f_classif
k: keep the k best-scoring features ("all" keeps every feature);
   higher-scoring features are more strongly associated with the label
.fit(X, y): fit on the feature matrix and the label
.transform(X): return the feature matrix with only the selected columns
'''
KB_CF = SelectKBest(score_func=f_classif, k=2)
KB_CF.fit(X_train2, y_train)
KB_CF.scores_, KB_CF.pvalues_
'''
(array([0.30612245, 2.20028868]), array([0.6038969, 0.1980965]))
'''
def SelectName(feature_data, model):
    # sort features by score (descending) and return the names of the top model.k
    scores = model.scores_
    indices = np.argsort(scores)[::-1]
    return list(feature_data.columns.values[indices[0:model.k]])
SelectName(X_train2, KB_CF)
'''
['MonthlyCharges', 'tenure']
'''
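'''
To actually filter the data, call .transform (or .fit_transform); it returns an
ndarray containing only the selected columns. A quick usage sketch:
'''
KB_CF.transform(X_train2).shape   # (7, 2) here, since k=2 keeps both features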
2. Feature Selection with Recursive Feature Elimination (RFE)
2.1 Principle
- Given an estimator, fit it on the dataset and compute each feature's importance, i.e. its coef_ or feature_importances_;
- Drop the feature with the smallest importance, then refit and recompute the importances of the remaining features;
- Repeat the previous step until the feature subset has no more than n_features_to_select features (a manual sketch follows below).
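'''
A minimal manual sketch of that loop, assuming a pandas feature matrix and an
estimator exposing feature_importances_ (the helper name manual_rfe is hypothetical):
'''
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def manual_rfe(X, y, n_features_to_select=1):
    # start from all columns and drop the weakest one each round
    cols = list(X.columns)
    while len(cols) > n_features_to_select:
        model = DecisionTreeClassifier().fit(X[cols], y)
        weakest = cols[int(np.argmin(model.feature_importances_))]
        cols.remove(weakest)
    return cols
# on X_train2/y_train above this should end at ['MonthlyCharges'], matching RFE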
2.2 sklearn implementation: sklearn.feature_selection.RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
'''
estimator: the estimator to fit; it must expose a feature-importance attribute
n_features_to_select: number of features to keep, defaults to half of the features
step: number of features removed per iteration, defaults to 1
importance_getter: attribute the importances are read from, defaults to coef_ or feature_importances_
verbose: 1 prints intermediate progress, defaults to 0
'''
tree_rfe = DecisionTreeClassifier()
rfe = RFE(estimator=tree_rfe)
rfe.fit(X_train2, y_train)
rfe.n_features_in_
'''
2
'''
rfe.n_features_
'''
1
'''
rfe.get_feature_names_out(), rfe.feature_names_in_[rfe.get_support()], rfe.get_support()
'''
(array(['MonthlyCharges'], dtype=object),
array(['MonthlyCharges'], dtype=object),
array([False, True]))
'''
rfe.ranking_
'''
array([2, 1])
'''
rfe_res = pd.Series(rfe.feature_names_in_, index=rfe.ranking_)
rfe_res.sort_index(inplace=False)
'''
1 MonthlyCharges
2 tenure
dtype: object
'''
'''
.predict / .score run the final-round model on the selected columns of the original data
'''
rfe.predict(X_train2), rfe.score(X_train2, y_train)
'''
(array([0, 0, 1, 0, 0, 1, 1]), 1.0)
'''
tree_rfe.fit(X_train2[rfe.get_feature_names_out()], y_train)
tree_rfe.predict(X_train2[rfe.get_feature_names_out()]), tree_rfe.score(X_train2[rfe.get_feature_names_out()], y_train)
'''
(array([0, 0, 1, 0, 0, 1, 1]), 1.0)
'''
X_train2[rfe.get_feature_names_out()]
'''
   MonthlyCharges
0           29.85
1           56.95
2           53.85
3           42.30
4           70.70
5           73.12
6           76.37
'''
2.3 Preventing overfitting with grid search: sklearn.model_selection.GridSearchCV
'''
Plain RFE only refits the model each round, without any hyperparameter tuning, so
every round's model tends to overfit, and feature_importances_ produced by an
overfit model are an unreliable basis for feature selection.
Grid search reference: https://blog.youkuaiyun.com/qq_45249685/article/details/125937140
'''
import pandas as pd
import numpy as np
X_train_OE = pd.DataFrame({'tenure': [1, 3, 2, 4, 2, 3, 4, 1, 3, 2]
,'Churn': [0,0,1,0,0,1,1,0,1,1]
,'MonthlyCharges': [29.85,56.95,53.85,42.30,70.70,73.12,76.37,12.4,67.2,34.98]
,'gender': [0.1,1.2,10.3,8.4,1.9,6.2,7.1,3.2,4.3,9.8]})
X_train3 = pd.DataFrame(X_train_OE, columns=['tenure', 'MonthlyCharges', 'gender'])
X_test3 = X_train3[:2]
y_train = X_train_OE['Churn']
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
tree_model = DecisionTreeClassifier()
tree_param = {'ccp_alpha': np.arange(0, 1, 0.1).tolist()
,'max_depth': np.arange(2, 8, 1).tolist()
,'min_samples_split': np.arange(2, 4, 1).tolist()
,'min_samples_leaf': np.arange(1, 4, 1).tolist()
,'max_leaf_nodes': np.arange(10, 20, 1).tolist()
}
tree_search_RFE = GridSearchCV(estimator=tree_model
,param_grid=tree_param
,n_jobs=-1)
tree_search_RFE.fit(X_train3, y_train)
tree_search_RFE.best_estimator_
'''
DecisionTreeClassifier(max_depth=2, max_leaf_nodes=10, min_samples_leaf=3)
'''
tree_search_RFE.best_params_
'''
{'ccp_alpha': 0.0,
'max_depth': 2,
'max_leaf_nodes': 10,
'min_samples_leaf': 1,
'min_samples_split': 3}
'''
rfe_search = RFE(estimator=tree_search_RFE.best_estimator_, n_features_to_select=1).fit(X_train3,y_train)
rfe_search.ranking_
'''
array([2, 3, 1])
'''
rfe_res_search = pd.Series(rfe_search.feature_names_in_, index=rfe_search.ranking_)
rfe_res_search.sort_index(inplace=False)
'''
1 gender
2 tenure
3 MonthlyCharges
dtype: object
'''
2.4 Grid search inside the elimination loop
'''
After each RFE round eliminates features, run a fresh hyperparameter search on the
remaining feature subset and feed the tuned tree into the next RFE round.
'''
tree_param = {'ccp_alpha': np.arange(0, 1, 0.1).tolist()
,'max_depth': np.arange(2, 8, 1).tolist()
,'min_samples_split': np.arange(2, 4, 1).tolist()
,'min_samples_leaf': np.arange(1, 4, 1).tolist()
,'max_leaf_nodes': np.arange(10, 20, 1).tolist()
}
from tqdm import tqdm
import gc
rfe_res_search1 = []
for i in tqdm(range(5)):
    i = 5 - i   # countdown: ask RFE to keep 5, 4, 3, 2, then 1 feature(s)
    tree_model = DecisionTreeClassifier()
    tree_search_RFE = GridSearchCV(estimator=tree_model
                                   ,param_grid=tree_param
                                   ,n_jobs=-1)
    if i == 5:
        # first round starts from the full training set
        X_train_temp = (X_train3).copy()
        X_test_temp = (X_test3).copy()
    tree_search_RFE.fit(X_train_temp, y_train)
    rfe_search = RFE(estimator=tree_search_RFE.best_estimator_, n_features_to_select=i).fit(X_train_temp, y_train)
    # keep only the surviving features for the next round
    X_train_temp = X_train_OE[rfe_search.get_feature_names_out()]
    # record what was eliminated this round (ranking != 1)
    rfe_res_search1.append(rfe_search.feature_names_in_[rfe_search.ranking_ != 1])
    gc.collect()
'''
100%|█████████████████████████████████████████████████████| 5/5 [00:29<00:00, 5.95s/it]
30843
'''
rfe_res_search1
'''
[array([], dtype=object),
array([], dtype=object),
array([], dtype=object),
array(['tenure'], dtype=object),
array(['MonthlyCharges'], dtype=object)]
'''
rfe_search.get_feature_names_out().tolist() + rfe_res_search1[::-1]
'''
['gender',
array(['tenure'], dtype=object),
array(['MonthlyCharges'], dtype=object),
array([], dtype=object),
array([], dtype=object),
array([], dtype=object)]
'''