A condensed version of my code for Problem B of the 2021 National College Student Data Statistics and Analysis Competition
Module imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = 'SimHei'   # CJK-capable font so Chinese plot labels render
plt.rcParams['axes.unicode_minus'] = False   # keep minus signs displaying correctly with this font
Data loading
# Read the preprocessed data
df = pd.read_csv('修正数据1.csv')
Feature importance
# Keep the 10 most important features (plus the target column)
new_data_0 = df[['coupon',
                 'distance_day',
                 'coupon_visit',
                 'study_num',
                 'course_order_num',
                 'login_diff_time',
                 'chinese_subscribe_num',
                 'learn_num',
                 'platform_num',
                 'first_order_price', 'result']]
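The plot_importance helper imported at the top is what the selection above is based on. Roughly, the ranking could be reproduced with a sketch like the following; the baseline model settings are illustrative assumptions, not the exact ones used in the competition, and it assumes the preprocessed file contains only numeric columns.

# Illustrative sketch: fit a quick baseline on all candidate features and
# plot their importances, which is how a top-10 list like the one above can be chosen.
all_features = df.drop(columns=['result'])
base_model = XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1)  # assumed baseline settings
base_model.fit(all_features, df['result'])
plot_importance(base_model, max_num_features=10)
plt.show()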
Data splitting
# Split into training and test sets (stratified on the target)
dataset = new_data_0
X = dataset.iloc[:, :10]   # the 10 selected features
y = dataset.iloc[:, 10]    # the target column 'result'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7, stratify=y)
Model building
# Initial model (the inline comments record the starting values before tuning)
model = XGBClassifier(max_depth=6,            # initial value 4
                      learning_rate=0.1,
                      n_estimators=151,        # initial value 200
                      objective='binary:logistic',
                      booster='gbtree',
                      reg_lambda=0.3,          # initial value 1
                      reg_alpha=0.01,          # initial value 0
                      gamma=0.7,               # initial value 0
                      eval_metric='error',
                      min_child_weight=1,
                      subsample=0.8,           # initial value 0.5
                      colsample_bytree=0.5,    # initial value 1
                      seed=7)
Parameter tuning
dtrain = xgb.DMatrix(X_train, label=y_train)
# Use xgb.cv to find the best number of boosting rounds (i.e. the number of trees)
cv_result = xgb.cv(model.get_xgb_params(),   # 352
                   dtrain,
                   num_boost_round=353,
                   nfold=6,
                   metrics='auc',
                   early_stopping_rounds=20,
                   verbose_eval=1,
                   show_stdv=True)
# Plot the cross-validated AUC against the number of boosting rounds
plt.figure(dpi=100, figsize=(5, 3))
plt.plot(cv_result['test-auc-mean'][:151])
plt.xlabel('n_estimators')
plt.ylabel('AUC')
plt.show()
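With early stopping enabled, the DataFrame returned by xgb.cv is truncated at the best iteration, so the optimal number of trees can simply be read off its length. A quick check (my own addition, for illustration):

# Read off the best number of boosting rounds found by xgb.cv
best_rounds = len(cv_result)
best_auc = cv_result['test-auc-mean'].iloc[-1]
print('best n_estimators: {}, cv AUC: {:.4f}'.format(best_rounds, best_auc))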

## max_depth、min_child_weight
param_grid = {'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'min_child_weight': [1, 2, 3, 4]}   # 'gamma':[1,2,3,4,5,6,7,8,9]
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_train, y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
## gamma
param_grid = {'gamma':[i/10.0 for i in range(0,11)] }
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)
##subsample、colsample_bytree
param_grid = {'subsample': [i/10.0 for i in range(5, 10)],
              'colsample_bytree': [i/10.0 for i in range(5, 10)]}
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)
##reg_alpha、reg_lambda
#'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
param_grid = {'reg_alpha': [0, 0.005, 0.01, 0.05],
              'reg_lambda': [0, 0.3, 0.5, 1]}
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)
# Finally, tune the learning rate
param_grid = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.12, 0.15]}
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)
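One step the snippets above leave implicit: after each grid search the winning values should be written back into model before starting the next stage, so that the searches build on each other. A minimal sketch of that step:

# Fold the best parameters from the latest grid search back into the model
# before running the next tuning stage (or the final fit).
model.set_params(**grid_search.best_params_)
print(model.get_xgb_params())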
Cross-validation
# Final parameters after tuning
from sklearn.model_selection import KFold
splits = KFold(n_splits=5, shuffle=True, random_state=20211)
parameters = {'max_depth': 6,
              'learning_rate': 0.01,
              'objective': 'binary:logistic',
              'booster': 'gbtree',
              'n_jobs': 4,
              'reg_alpha': 0.01,
              'reg_lambda': 0.3,
              'gamma': 0.7,
              'eval_metric': 'error',
              'min_child_weight': 1,
              'subsample': 0.8,
              'colsample_bytree': 0.5,
              'seed': 7}
predicted_train_xgb = np.zeros(len(X_train))
predicted_test_xgb = np.zeros(len(X_test))
for fold_, (trn_idx, val_idx) in enumerate(splits.split(X_train, y_train)):
    print("fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data,
                    num_boost_round=5000,
                    evals=watchlist,
                    early_stopping_rounds=200,
                    verbose_eval=100,
                    params=parameters)
    # Out-of-fold predictions on the validation split, and test predictions averaged across folds
    predicted_train_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
    predicted_test_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / splits.n_splits
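The out-of-fold predictions collected in predicted_train_xgb give an estimate of generalization before looking at the test set. A quick check (my own addition, not part of the original competition code):

# AUC of the out-of-fold predictions on the training data
print('out-of-fold AUC: {:.4f}'.format(roc_auc_score(y_train, predicted_train_xgb)))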
Model evaluation
# Confusion matrix / classification report
# bool -> 0/1: threshold the averaged test probabilities at 0.40
pre_data = (predicted_test_xgb >= 0.40).astype(int)
# Print the evaluation report (true labels first, predictions second)
print(classification_report(y_test, pre_data))
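The confusion_matrix and roc_auc_score imports at the top belong to this same evaluation step. A short sketch of how they can be used here (same 0.40 threshold as above):

# Confusion matrix at the chosen threshold, plus a threshold-free AUC on the test set
print(confusion_matrix(y_test, pre_data))
print('test AUC: {:.4f}'.format(roc_auc_score(y_test, predicted_test_xgb)))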

Note: in the actual competition, feature engineering, model fusion, and other techniques were also used.
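As a rough illustration of the model-fusion idea, here is a sketch that blends the XGBoost test probabilities with a second model. The choice of logistic regression and the 50/50 weights are assumptions for the sketch, not the setup actually used in the competition.

# Hypothetical blending sketch: average the XGBoost probabilities with a
# second model's probabilities. The second model and the weights are illustrative only.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_test_prob = lr.predict_proba(X_test)[:, 1]

blended_prob = 0.5 * predicted_test_xgb + 0.5 * lr_test_prob
blended_label = (blended_prob >= 0.40).astype(int)
print(classification_report(y_test, blended_label))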
Summary
This competition is a fairly rare one in the undergraduate data analysis space. Since it was the first edition, there were not many participants and it is not well known, but working through it end to end still taught me a lot.

The first lesson is data preprocessing, which is not shown here but is arguably the most important part. Each field has to be judged against its concrete meaning, and the relationships between fields have to be considered as well. For example, if a survey records a person's years of driving experience, the survey date, and the date of birth, then age can obviously be computed from the survey date and the date of birth, and that age must be at least the driving years plus 18; otherwise the record is anomalous. Even if no such anomaly actually appears in the data, it is still worth checking for.

The second lesson is parameter tuning. The improvement from this part is not dramatic, but it makes the modelling more complete. As for feature engineering, in short it means mining the existing fields (features): you can add the mean or variance of a field as a new feature, combine several features, and so on. The model itself is secondary, because its structure is fixed and everyone's model ends up more or less the same, whereas everyone's feature engineering can be different.
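A minimal sketch of those two ideas, the consistency check and the aggregate features; all column names here (birth_date, survey_date, driving_years, group_id, price) are made up for illustration and are not from the competition data.

# Hypothetical sketch of the two ideas above; the columns are invented for the example.
demo = pd.DataFrame({'birth_date': pd.to_datetime(['1990-05-01', '2005-01-01']),
                     'survey_date': pd.to_datetime(['2021-06-01', '2021-06-01']),
                     'driving_years': [10, 5],
                     'group_id': [1, 1],
                     'price': [100.0, 80.0]})

# Consistency check: age must be at least driving_years + 18
age = (demo['survey_date'] - demo['birth_date']).dt.days / 365.25
demo['age_anomaly'] = age < demo['driving_years'] + 18

# Aggregate features: per-group mean and variance of an existing field
demo['price_group_mean'] = demo.groupby('group_id')['price'].transform('mean')
demo['price_group_var'] = demo.groupby('group_id')['price'].transform('var')
print(demo)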