数据特征筛选
通过相关性
# Pairwise correlation between every pair of columns in X.
corrmat = X.corr()

# Render the whole matrix as a heatmap with square cells on a 12x9 figure.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, ax=ax)
从这里可以发现每个特征彼此之间的相关性。
# Keep the k columns whose entry in the 'status' column of the correlation
# matrix is largest (signed values, not absolute values).
k = 10
f, ax = plt.subplots(figsize=(12, 9))
cols = corrmat.nlargest(k, 'status').index

# Recompute the correlation matrix restricted to the selected columns.
data = X[cols]
cm = data.corr()

sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    square=True,
    annot=True,  # write each coefficient inside its cell
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
    ax=ax,
)
plt.show()
这里我选取了与 'status' 相关系数最大的 10 个特征(其中包含 'status' 本身)。
通过随机森林筛选特征
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on all columns of X and read off per-feature importances.
feat_lables = X.columns
forest = RandomForestClassifier(n_estimators=100, n_jobs=1)
forest.fit(X, label)
importance = forest.feature_importances_

# Column positions ordered from most to least important.
imp_result = np.argsort(importance)[::-1]

# Print the ranking. The name and the score must both be looked up through
# the sorted position, otherwise names are printed in column order while
# scores are printed in descending order and the two do not match.
for i, col_pos in enumerate(imp_result):
    print("%2d. %-*s %f" % (i + 1, 30, feat_lables[col_pos], importance[col_pos]))

# Drop, in place, every column whose importance is below the threshold, then
# rebuild the combined feature + label frame.
threshold = 0.01
data_index = list(X.columns[importance < threshold])
X.drop(data_index, axis=1, inplace=True)
data_for = pd.concat([X, label], axis=1)
这里一共筛选出49个特征
通过IV值筛选
def woe(X, y, event=1):
    """Compute the information value (IV) of every column of X against y.

    Each column is read as ``X[feature].values``. Columns that
    ``type_of_target`` classifies as ``'continuous'`` are bucketed with
    ``discrete`` first; the (possibly bucketed) values are then passed to
    ``woe_single_x``.

    Args:
        X: Table exposing ``.columns`` and ``X[name].values``.
        y: Labels, forwarded unchanged to ``woe_single_x``.
        event: Label value treated as the positive event.

    Returns:
        dict mapping each column name to its IV. The per-bin WOE tables
        produced by ``woe_single_x`` are not returned.
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) Discretize continuous features so they have a finite set of bins.
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) Only the IV is kept; the WOE mapping is discarded.
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
    return iv_dict
def discrete(x, n_bins=5):
    """Bucket numeric values into equal-frequency bins labelled 1..n_bins.

    Bin edges are the percentiles at 0, 100/n_bins, ..., 100 of ``x``. Both
    edges of every bin are inclusive and bins are assigned in ascending
    order, so a value that sits exactly on an interior edge ends up with the
    label of the higher bin.

    Args:
        x: Array-like of numeric values.
        n_bins: Number of bins; the default of 5 gives 20-percentile steps.

    Returns:
        Float ndarray with the same shape as ``x`` holding the bin labels.
        Entries that fall in no bin keep the initial value 0.
    """
    x = np.asarray(x)
    res = np.zeros(x.shape)
    # Compute each edge once instead of twice per bin.
    edges = [
        stats.scoreatpercentile(x, 100.0 * i / n_bins)
        for i in range(n_bins + 1)
    ]
    for i in range(n_bins):
        # The range test is already the mask we need; no membership lookup
        # against the selected values is required.
        in_bin = (x >= edges[i]) & (x <= edges[i + 1])
        res[in_bin] = i + 1  # values inside [edge_i, edge_{i+1}] get label i+1
    return res
def woe_single_x(x, y, feature, event=1):
    """Compute per-bin weight of evidence (WOE) and the total IV of a feature.

    For every distinct value ``v`` of ``x``::

        rate_event     = (# rows with x == v and y == event) / (# rows with y == event)
        rate_non_event = (# rows with x == v and y != event) / (# rows with y != event)
        woe[v]         = log(rate_event / rate_non_event)
        iv            += (rate_event - rate_non_event) * woe[v]

    A rate of exactly 0 is replaced by 0.0001 before the logarithm is taken.

    Args:
        x: 1-D array-like of already-discrete feature values.
        y: 1-D labels aligned with ``x`` by position (array or Series).
        feature: Feature name; accepted for interface compatibility, unused.
        event: Label value treated as the positive event.

    Returns:
        Tuple ``(woe_dict, iv)``: ``woe_dict`` maps each distinct value of
        ``x`` to its WOE, ``iv`` is the summed information value.
    """
    x = np.asarray(x)
    # Work on plain arrays so rows are matched by position. Looking rows up
    # by index label picks the wrong rows whenever y does not carry a default
    # 0..n-1 index (e.g. after filtering or a train/test split).
    y_arr = np.asarray(y)
    is_event = y_arr == event

    event_total = is_event.sum()
    non_event_total = y_arr.shape[-1] - event_total

    iv = 0
    woe_dict = {}
    for x1 in set(x):  # one iteration per distinct bin value
        in_bin = x == x1
        event_count = is_event[in_bin].sum()
        non_event_count = in_bin.sum() - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        # Guard each rate independently so neither log(0) nor x / 0 can occur.
        if rate_event == 0:
            rate_event = 0.0001
        if rate_non_event == 0:
            rate_non_event = 0.0001
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv
# IV of every remaining column of X against the label.
iv_dict = woe(X, label)

# (feature, IV) pairs ordered from highest to lowest IV.
iv = sorted(
    iv_dict.items(),
    key=lambda name_and_iv: name_and_iv[1],
    reverse=True,
)
iv
- 问题
由于刚开始接触,很多东西了解得还不是很全面,下面两种特征筛选方式都是借鉴大佬的。
- 参考:
ML - 贷款用户逾期情况分析5 - 特征工程2(特征选择) - libh的博客 - CSDN博客 https://blog.csdn.net/a786150017/article/details/84573202