# Persisted feature-selection table; written and re-written below.
feature_csv = 'model/easy_feature_select.csv'
# Two features whose absolute pairwise correlation reaches this value are
# treated as redundant.
corr_threshold = 0.85

# NOTE(review): indentation was lost in the reviewed copy of this section;
# the nesting below (everything up to the final reload/fit sits under the
# `if`) is the most plausible reading -- confirm against the original file.
if not os.path.exists(feature_csv):
    # Keep only the first 150 rows of the importance table (presumably
    # already sorted by importance -- TODO confirm) and persist them.
    df_importances = df_importances[:150]
    df_importances.to_csv(feature_csv, encoding='gbk', index=False)
    # Reload the data after writing the selection file (presumably
    # set_data() picks its columns from that file -- TODO confirm).
    x_train, x_test, y_train, y_test, df_ft = set_data(df_0, df_1, df_9,
                                                       cfg_dict)

    # Prune redundant features: walk the columns in order and, for every
    # feature that is kept, drop each *other* remaining feature whose
    # absolute correlation with it is at least `corr_threshold`.
    #
    # The matrix is computed once: a pairwise correlation does not depend
    # on which other columns are present, so recomputing it after every
    # removal is unnecessary.  Excluding the column itself explicitly
    # (rather than zeroing every value equal to 1) keeps a feature from
    # being dropped because of its own diagonal entry and still catches
    # perfectly correlated duplicates.
    abs_corr = x_train.corr().abs()
    dropped = set()
    for column in abs_corr.columns:
        if column in dropped:
            # Already removed as redundant with an earlier feature.
            continue
        strong = abs_corr.index[abs_corr[column] >= corr_threshold]
        names = [name for name in strong
                 if name != column and name not in dropped]
        if names:
            print(column, '的强相关特征:', names)
            dropped.update(names)
    feature_list = [name for name in x_train.columns.tolist()
                    if name not in dropped]
    # Leave df_corr describing only the surviving features, as before.
    df_corr = x_train[feature_list].corr()

    # Disabled: force-include a fixed set of features regardless of pruning.
    #feature_list = list(set(feature_list + ['呼叫次数', '入网时长(月)',
    #                         'MOU_avg', 'DOU_avg', '省外流量占比_avg']))

    # Re-attach the importance columns to the surviving features (left join
    # keeps every surviving feature) and persist the reduced table.
    df_feature = pd.DataFrame(feature_list, columns=['features'])
    df_importances = pd.merge(df_feature, df_importances, on='features',
                              how='left')
    df_importances.to_csv(feature_csv, encoding='gbk', index=False)
    # Reload the data for the reduced feature set.
    x_train, x_test, y_train, y_test, df_ft = set_data(df_0, df_1, df_9,
                                                       cfg_dict)
    # Retrain, then overwrite the table with the importances reported for
    # the retrained model.
    bst = fit(cfg_dict, x_train, y_train, x_test, y_test)
    df_importances = feature_imp(model=bst, x_train=x_train, plot=True)
    df_importances.to_csv(feature_csv, encoding='gbk', index=False)

# Train the model on the data loaded after the feature table is in place.
x_train, x_test, y_train, y_test, df_ft = set_data(df_0, df_1, df_9, cfg_dict)
bst = fit(cfg_dict, x_train, y_train, x_test, y_test)