例子:鸢尾花数据集,使用Xgboost输出特征重要程度
# Example: iris dataset — fit an XGBoost classifier and rank feature importances.
import xgboost as xgb
import pandas as pd  # FIX: pandas was used below (pd.DataFrame) but never imported
from sklearn import datasets

# Load iris: X holds the four measurements, y the 3-class target.
iris = datasets.load_iris()
y = iris['target']
X = iris['data']

# Fit with default hyper-parameters; importances come from the trained booster.
xgb_model = xgb.XGBClassifier().fit(X, y)

# Pair each feature name with its importance and sort descending.
temp = pd.DataFrame()
temp['feature_names'] = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
temp['feature_importances'] = xgb_model.feature_importances_
temp = temp.sort_values('feature_importances', ascending=False)
temp
分组交叉特征筛选
- 当前关注训练后从模型中获取特征重要程度
# Candidate feature columns fed to the grouped cross-validation selection below.
lst = ['td_score', 'jxl_score', 'mj_score', 'rh_score', 'zzc_score', 'zcx_score', 'person_info', 'finance_info', 'credit_info', 'act_info']
# Define the LightGBM train/evaluate helper
def LGB_test(train_x, train_y, test_x, test_y):
    """Fit a binary LGBMClassifier and return (model, validation AUC).

    Trains with AUC early stopping (100 rounds) against the held-out fold,
    which LightGBM records as eval set 'valid_1'.
    """
    from multiprocessing import cpu_count
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None,
        n_jobs=cpu_count() - 1,
        # NOTE(review): dropped `max_features=140` (a scikit-learn parameter,
        # not a LightGBM one — it was silently ignored) and `num_iterations=800`
        # (an alias of n_estimators with the same value, which only triggered
        # an alias-conflict warning). Neither removal changes training.
    )
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=100)
    print(clf.n_features_)
    return clf, clf.best_score_['valid_1']['auc']
feature_lst = []   # per-fold feature-importance frames
ks_train_lst = []  # per-fold train KS
ks_test_lst = []   # per-fold test KS

# FIX: the loop body below had lost its indentation in the paste; restored.
for rk in set(df_train['rank']):
    # 'rank' is a binned grouping column with values 0,1,2,3,4.
    # The training set is pre-split into 5 groups: train on 4, test on 1.
    # Build this fold's train/test split.
    ttest = df_train[df_train['rank'] == rk]
    ttrain = df_train[df_train['rank'] != rk]
    train = ttrain[lst]
    train_y = ttrain.bad_ind
    test = ttest[lst]
    test_y = ttest.bad_ind
    model, auc = LGB_test(train, train_y, test, test_y)

    # Collect this fold's feature importances, indexed by feature name.
    feature = pd.DataFrame(
        {'name': model.booster_.feature_name(),
         'importance': model.feature_importances_
         }).set_index('name')
    feature_lst.append(feature)

    # KS and AUC on the train and test folds.
    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]
    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)
    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()
    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)  # computed but not printed below
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)     # computed but not printed below
    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)

# Average KS across the five folds (runs after the loop).
train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)
print('train_ks: ', train_ks)
print('test_ks: ', test_ks)
分组后进行5轮计算,得到5组特征重要程度
- 将5组重要程度拼接后排序
# Stack the five folds' importance frames side by side, average each feature
# across folds, and order from most to least important.
feature_importance = (
    pd.concat(feature_lst, axis=1)
      .mean(axis=1)
      .sort_values(ascending=False)
)
feature_importance
输出内容
- 取出特征
# Keep only the features whose averaged importance exceeds 20.
feature_importance.loc[feature_importance > 20].index.tolist()
输出内容
心得:记录一下使用Xgboost输出特征重要程度,分组交叉筛选可以使数据更加可靠