数据竞赛实践：Catboost与LightGBM模型调参-CSDN博客

本文链接：https://blog.csdn.net/qq_36523203/article/details/106430374

本文记录了一次数据比赛的过程，包括使用Python进行数据处理和特征工程，然后详细介绍了利用Catboost和LightGBM模型进行训练和预测，并重点解析了模型参数的含义和调参策略。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

数据比赛里用到的代码，可快速应用到其他比赛

使用库

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold #数据进行交叉验证的，模型选择时使用
from lightgbm import LGBMClassifier#lightgbm做分类的模型
from lightgbm import LGBMRegressor#lightgbm做回归预测
from sklearn.metrics import f1_score#模型评价指标函数 f1
from sklearn.metrics import mean_squared_error#均方误差
import matplotlib.pyplot as plt
from tqdm import tqdm
import catboost as cbt#模型catboost
from sklearn.cluster import KMeans#聚类算法
from sklearn.preprocessing import LabelEncoder#标签

数据处理

# Load the historical exam scores and drop rows whose score is exactly 0.
df_train = pd.read_csv('/home/kesci/input/smart_edu7557/exam_score.csv')
df_train = df_train[(df_train['score']!=0)].reset_index(drop=True)


def process_index(x):
    """Return the index labels of the values of *x* inside the 1.5*IQR fences.

    The quartiles are computed once and reused for both fences.
    """
    q1 = np.percentile(x, 25)
    q3 = np.percentile(x, 75)
    iqr = q3 - q1
    inside = (x >= (q1 - 1.5*iqr)) & (x <= (q3 + 1.5*iqr))
    return list(x[inside].index)


# Per (student, course): index labels of the non-outlier scores.
# A list of (name, func) tuples replaces the dict-renaming form of agg, which
# pandas >= 1.0 rejects with SpecificationError.
tmp_process = (
    df_train.groupby(by=['student_id','course'])['score']
    .agg([('process_index', process_index)])
    .reset_index()
)

# The submission template provides the rows to predict; its 'pred' column is
# renamed to 'score' so train and test share one schema.
df_test = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
df_test.rename(columns={'pred':'score'},inplace = True)
course_class = pd.read_csv('/home/kesci/input/smart_edu7557/course.csv')
student = pd.read_csv('/home/kesci/input/smart_edu7557/student.csv')
all_know = pd.read_csv('/home/kesci/input/smart_edu7557/all_knowledge.csv')

# Stack train on top of test (train rows first; later code slices df_all by
# len(df_train)). pd.concat replaces DataFrame.append, removed in pandas 2.0.
df_all = pd.concat([df_train, df_test])
df_all = df_all.merge(course_class, on='course', how='left')
df_all = df_all.merge(student, on='student_id', how='left')

特征工程

增加数据样本的特征维度，找到与目标相关性更强的特征

from sklearn.metrics.pairwise import cosine_distances
# One table per course; each row is an exam, the columns other than
# 'course'/'exam_id' are treated below as knowledge-point columns.
course1_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course1_exams.csv')
course2_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course2_exams.csv')
course3_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course3_exams.csv')
course4_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course4_exams.csv')
course5_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course5_exams.csv')
course6_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course6_exams.csv')
course7_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course7_exams.csv')
course8_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course8_exams.csv')

# Add difficulty and coverage features to every course table, in place.
for course_no, exam_df in enumerate(
        [course1_exam, course2_exam, course3_exam, course4_exam,
         course5_exam, course6_exam, course7_exam, course8_exam], start=1):
    course_name = 'course' + str(course_no)
    # Snapshot the knowledge-point columns before any derived column is added.
    know_cols = [c for c in exam_df.columns if c not in ['course','exam_id']]
    exam_df['course'] = course_name

    # NOTE(review): the dot product assumes all_know's rows for this course are
    # in the same order as the exam table's knowledge-point columns -- confirm.
    complexity = np.array(all_know.loc[all_know['course'] == course_name, 'complexity'])
    know_matrix = exam_df[know_cols].values
    exam_df['hard'] = np.dot(know_matrix, complexity)
    # The small epsilon keeps the inverse finite when 'hard' is 0.
    exam_df['hard_inverse'] = 1/(exam_df['hard'] + 1e-10)

    # Non-zero entries mark the knowledge points an exam touches.
    # Builtin bool replaces the np.bool alias, removed in NumPy 1.24.
    covered = know_matrix.astype(bool)
    number_know = np.sum(covered, axis=1)
    ration = number_know/len(know_cols)
    # Min-max scale the coverage ratio within the course.
    exam_df['ration_know'] = (ration - ration.min())/(ration.max() - ration.min())
    exam_df['number_know'] = number_know
    # 100 divided by the number of covered knowledge points
    # (inf when an exam covers none).
    exam_df['know_mean'] = 100/number_know
    
# For every exam, record the exam_id of the most similar *other* exam in the
# same course, by cosine similarity of the raw knowledge-point columns.
# 'know_mean' is excluded together with the other derived columns; the original
# exclusion list omitted it, so it leaked into the similarity vectors.
derived_cols = ['course','exam_id','hard','hard_inverse','ration_know','number_know','know_mean']
for exam_df in [course1_exam,course2_exam,course3_exam,course4_exam,course5_exam,course6_exam,course7_exam,course8_exam]:
    know_cols = [c for c in exam_df.columns if c not in derived_cols]
    similarity = 1 - cosine_distances(exam_df[know_cols])
    # Rule out self-matches explicitly (including the all-zero-similarity case).
    np.fill_diagonal(similarity, -np.inf)
    nearest = np.argmax(similarity, axis=1)
    # argmax yields row positions, so look the ids up positionally instead of
    # writing into (and reading labels from) a column slice.
    exam_df['sim_exam_id'] = exam_df['exam_id'].values[nearest]
    
# For every (course, section) and then every (course, category) group of
# knowledge points, add a column to that course's exam table holding the
# row-wise sum of the group's knowledge-point columns. The new column is named
# after the section / category value itself.
exam_by_course = {
    'course1': course1_exam, 'course2': course2_exam,
    'course3': course3_exam, 'course4': course4_exam,
    'course5': course5_exam, 'course6': course6_exam,
    'course7': course7_exam, 'course8': course8_exam,
}
for group_col in ['section', 'category']:
    # Group once per grouping column instead of once per loop iteration.
    know_groups = all_know.groupby(['course', group_col])['knowledge_point'].groups
    for (course, group_name), row_labels in know_groups.items():
        exam_df = exam_by_course.get(course)
        if exam_df is None:
            # Courses without an exam table are skipped, as before.
            continue
        # .groups holds index labels, so select them with .loc.
        point_cols = list(all_know.loc[row_labels, 'knowledge_point'])
        exam_df[group_name] = exam_df[point_cols].sum(axis=1)
# Same nearest-exam lookup as 'sim_exam_id', but on the aggregated columns:
# columns whose name contains 'S:' feed 'sectionsim_exam_id', columns whose
# name contains 'C:' feed 'catsim_exam_id'.
for name_tag, target_col in [('S:', 'sectionsim_exam_id'), ('C:', 'catsim_exam_id')]:
    for exam_df in [course1_exam,course2_exam,course3_exam,course4_exam,course5_exam,course6_exam,course7_exam,course8_exam]:
        tagged_cols = [c for c in exam_df.columns if name_tag in c]
        similarity = 1 - cosine_distances(exam_df[tagged_cols])
        # Rule out self-matches explicitly.
        np.fill_diagonal(similarity, -np.inf)
        nearest = np.argmax(similarity, axis=1)
        # argmax yields row positions; look the ids up positionally rather
        # than writing into a column slice and reading back by label.
        exam_df[target_col] = exam_df['exam_id'].values[nearest]
# Cluster the exams of each course on their 'K:' columns at four granularities.
# cluster_count keeps increasing across all (granularity, course) fits, so each
# fit gets its own label prefix: 'c' + running counter + cluster id.
cluster_count = 1
exam_frames = [course1_exam, course2_exam, course3_exam, course4_exam,
               course5_exam, course6_exam, course7_exam, course8_exam]
cluster_settings = [('cluster', 20), ('cluster_30', 30), ('cluster_40', 40), ('cluster_50', 50)]
for cluster_col, n_clusters in cluster_settings:
    for name in exam_frames:
        col_K = [c for c in name.columns if 'K:' in c]
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(name[col_K])
        label_prefix = 'c' + str(cluster_count)
        name[cluster_col] = [label_prefix + str(label) for label in kmeans.labels_]
        cluster_count += 1
# Stack the eight per-course tables into one. Columns that exist only for some
# courses come out as NaN and are filled with 0.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0)
# and builds the result in one pass.
course_exam = pd.concat([
    course1_exam, course2_exam, course3_exam, course4_exam,
    course5_exam, course6_exam, course7_exam, course8_exam,
])
course_exam.fillna(0,inplace = True)
#特征提取

from sklearn.decomposition import PCA
# Split the remaining course_exam columns into three families by the tag their
# name contains ('K:', 'S:', 'C:') and pull each family out as a matrix.
non_feature_cols = {
    'cluster_50', 'cluster_40', 'cluster_30', 'cluster', 'course', 'exam_id',
    'hard', 'hard_inverse', 'ration_know', 'number_know',
    'sim_exam_id', 'sectionsim_exam_id', 'catsim_exam_id',
}
col = [c for c in course_exam.columns if c not in non_feature_cols]
print(len(col))
col_K, col_S, col_C = ([c for c in col if tag in c] for tag in ('K:', 'S:', 'C:'))
x_K, x_S, x_C = (course_exam[family].values for family in (col_K, col_S, col_C))
# Project each column family onto its leading principal components and store
# component j (1-based) as '<prefix>_feature_<j>' on course_exam.
# The original comment claimed 5 components; the code keeps 20.
# NOTE(review): PCA needs at least n_components rows and columns per family --
# confirm the 'C:' / 'S:' families are that wide.
n_components = 20
pca = PCA(n_components=n_components)

# The same estimator is refit per family, in the original K, C, S order.
reduced_x_K = pca.fit_transform(x_K)
reduced_x_C = pca.fit_transform(x_C)
reduced_x_S = pca.fit_transform(x_S)

for prefix, reduced in [('k', reduced_x_K), ('C', reduced_x_C), ('S', reduced_x_S)]:
    for component in range(n_components):
        course_exam['%s_feature_%d' % (prefix, component + 1)] = reduced[:, component]

# Attach the exam-level features to every row of df_all, keyed by
# (exam_id, course): first the hand-made features, then the K / S / C
# principal components.
exam_keys = ['course', 'exam_id']
component_ids = range(1, 21)
sub_course_exam = course_exam[[
    'cluster_50', 'cluster_40', 'cluster_30', 'cluster', 'course', 'exam_id',
    'hard', 'hard_inverse', 'ration_know', 'number_know',
    'sim_exam_id', 'sectionsim_exam_id', 'catsim_exam_id',
]]
sub_course_exam_K = course_exam[exam_keys + ['k_feature_%d' % n for n in component_ids]]
sub_course_exam_S = course_exam[exam_keys + ['S_feature_%d' % n for n in component_ids]]
sub_course_exam_C = course_exam[exam_keys + ['C_feature_%d' % n for n in component_ids]]
for exam_features in (sub_course_exam, sub_course_exam_K, sub_course_exam_S, sub_course_exam_C):
    df_all = df_all.merge(exam_features, on=['exam_id', 'course'], how='left')

def _trimmed(x, k):
    """Return Series *x* sorted descending without its k largest and k smallest values."""
    return x.sort_values(ascending=False).iloc[k:-k]


def get_mean(x):
    """Mean of *x* after dropping the single largest and smallest value."""
    return _trimmed(x, 1).mean()


def get_mean2(x):
    """Mean of *x* after dropping the two largest and two smallest values."""
    return _trimmed(x, 2).mean()


def get_ptp(x):
    """Range (max - min) of *x* after dropping the two largest and two smallest values.

    Computed as max - min because Series.ptp() was removed in pandas 1.0;
    an empty trimmed series yields NaN.
    """
    kept = _trimmed(x, 2)
    return kept.max() - kept.min()


def get_std(x):
    """Standard deviation of *x* after dropping the two largest and two smallest values."""
    return _trimmed(x, 2).std()


def get_mod(x):
    """Largest of the most frequent values of *x*."""
    return pd.Series(data=x).mode().max()


# Means over the trailing 5 / 10 / 20 / 30 entries, in the order given.
def get_mean_last5(x):
    """Mean of the last 5 entries of *x* (all entries if fewer)."""
    return x[-5:].mean()


def get_mean_last10(x):
    """Mean of the last 10 entries of *x* (all entries if fewer)."""
    return x[-10:].mean()


def get_mean_last20(x):
    """Mean of the last 20 entries of *x* (all entries if fewer)."""
    return x[-20:].mean()


def get_mean_last30(x):
    """Mean of the last 30 entries of *x* (all entries if fewer)."""
    return x[-30:].mean()


def get_mean_lasthalf(x):
    """Mean of the last len(x)//2 entries of *x*.

    For a single entry the half-length is 0, so the slice covers the whole
    input and that entry is returned.
    """
    return x[-(len(x) // 2):].mean()
# Rank of each score among all scores of the same exam.
df_train['per_exam_rank']=df_train.groupby(by=['exam_id'], as_index=False)['score'].rank()

# All aggregations below use a list of (output_name, func) tuples plus
# reset_index(): the dict-renaming form of SeriesGroupBy.agg raises
# SpecificationError on pandas >= 1.0. String names ('mean', 'std', ...) select
# the pandas reductions that np.mean / np.std / ... were mapped to.

# Per (student, course): statistics of the per-exam rank.
rank_aggs = [
    ('rank_last5_mean', get_mean_last5), ('rank_last10_mean', get_mean_last10),
    ('rank_last20_mean', get_mean_last20), ('rank_last30_mean', get_mean_last30),
    ('rank_lasthalf_mean', get_mean_lasthalf),
    ('mean_rank', 'mean'), ('std_rank', 'std'), ('max_rank', 'max'),
    ('min_rank', 'min'), ('median_rank', 'median'),
]
tmp_rank = df_train.groupby(by=['student_id','course'])['per_exam_rank'].agg(rank_aggs).reset_index()
df_all=df_all.merge(tmp_rank, on=['student_id','course'], how='left')

# Per (student, course): statistics of the score.
course_score_aggs = [
    ('lasthalf', get_mean_lasthalf), ('last30', get_mean_last30),
    ('last20', get_mean_last20), ('last10', get_mean_last10),
    ('last5', get_mean_last5), ('mode', get_mod),
    ('mean_score', 'mean'), ('median_score', 'median'), ('std_score', 'std'),
    ('max_score', 'max'), ('min_score', 'min'),
    ('cos_mean_score', get_mean), ('cos_mean_score2', get_mean2),
    ('ptp', np.ptp), ('var', 'var'),
]
tmp1 = df_train.groupby(by=['student_id','course'])['score'].agg(course_score_aggs).reset_index()

# Per student, across all courses: statistics of the score.
student_score_aggs = [
    ('s_mode', get_mod), ('s_mean_score', 'mean'), ('s_median_score', 'median'),
    ('s_std_score', 'std'), ('s_max_score', 'max'), ('s_min_score', 'min'),
    ('s_cos_mean_score', get_mean), ('s_cos_mean_score2', get_mean2),
    ('s_ptp', np.ptp), ('s_var', 'var'),
]
tmp2 = df_train.groupby(by=['student_id'])['score'].agg(student_score_aggs).reset_index()

# Per (student, course): statistics of the difficulty of the exams taken.
# No visible line adds 'hard' to df_train (it lives on course_exam), so it is
# joined in here when missing; a left merge keeps the train row order that the
# "last N" helpers depend on.
if 'hard' in df_train.columns:
    train_with_hard = df_train
else:
    train_with_hard = df_train.merge(
        course_exam[['course', 'exam_id', 'hard']],
        on=['exam_id', 'course'], how='left')
hard_aggs = [
    ('hard_lasthalf', get_mean_lasthalf), ('hard_last30', get_mean_last30),
    ('hard_last20', get_mean_last20), ('hard_last10', get_mean_last10),
    ('hard_last5', get_mean_last5), ('hard_mean', 'mean'),
]
tmp_hard = train_with_hard.groupby(by=['student_id','course'])['hard'].agg(hard_aggs).reset_index()
# Rank of each student's mean score among all students of the same course.
# A single grouped rank over tmp1 replaces eight copy-pasted per-course frames.
# Those paired tmp2['student_id'] with each course's ranks by position, which
# fails with a length mismatch as soon as a student has no row for a course.
df_rank = tmp1[['student_id', 'course']].copy()
df_rank['rank'] = tmp1.groupby('course')['mean_score'].rank()
# Attach the per-student aggregates to df_all with left joins.
# tmp_process is not merged here; that merge was commented out in the script.
aggregate_joins = [
    (tmp1, ['student_id', 'course']),
    (tmp_hard, ['student_id', 'course']),
    (tmp2, ['student_id']),
    (df_rank, ['student_id', 'course']),
]
for aggregate_frame, join_keys in aggregate_joins:
    df_all = df_all.merge(aggregate_frame, on=join_keys, how='left')
# Score trend features: ratio of the mean score over a recent window to the
# mean score over a longer window ('all' = overall mean).
# Fixes two copy-paste slips: last10/last30 used to overwrite 'trend10-20'
# (now 'trend10-30'), and 'trend10-all' was assigned twice where 'trend20-all'
# belongs.
score_window = {
    '5': 'last5', '10': 'last10', '20': 'last20', '30': 'last30',
    'half': 'lasthalf', 'all': 'mean_score',
}
window_pairs = [
    ('5', '10'), ('5', '20'), ('5', '30'), ('5', 'half'), ('5', 'all'),
    ('10', '20'), ('10', '30'), ('10', 'half'), ('10', 'all'),
    ('20', 'half'), ('20', 'all'),
]
for recent, reference in window_pairs:
    df_all['trend' + recent + '-' + reference] = (
        df_all[score_window[recent]] / df_all[score_window[reference]])
df_all['score_gap']=df_all['max_score']-df_all['min_score']

# Rank trend features: ratio of the mean per-exam rank over a recent window to
# the mean rank over a longer window ('all' = overall mean rank).
rank_window = {
    '5': 'rank_last5_mean', '10': 'rank_last10_mean', '20': 'rank_last20_mean',
    '30': 'rank_last30_mean', 'half': 'rank_lasthalf_mean', 'all': 'mean_rank',
}
rank_pairs = [
    ('5', '10'), ('5', '20'), ('5', '30'), ('5', 'half'), ('5', 'all'),
    ('10', '20'), ('10', '30'), ('10', 'half'), ('10', 'all'),
    ('20', 'half'), ('20', 'all'),
]
for recent, reference in rank_pairs:
    df_all['rank' + recent + '-' + reference] = (
        df_all[rank_window[recent]] / df_all[rank_window[reference]])
df_all['rank_gap'] = df_all['max_rank'] - df_all['min_rank']


# Score-to-difficulty trends: (score / difficulty) over a recent window divided
# by (score / difficulty) over a longer window.
# Fixes a copy-paste slip: the last10-vs-last30 ratio used to overwrite
# 'trend10-20_hard'; it is now stored as 'trend10-30_hard'.
score_window = {
    '5': 'last5', '10': 'last10', '20': 'last20', '30': 'last30',
    'half': 'lasthalf', 'all': 'mean_score',
}
hard_window = {
    '5': 'hard_last5', '10': 'hard_last10', '20': 'hard_last20',
    '30': 'hard_last30', 'half': 'hard_lasthalf', 'all': 'hard_mean',
}
window_pairs = [
    ('5', '10'), ('5', '20'), ('5', '30'), ('5', 'half'), ('5', 'all'),
    ('10', '20'), ('10', '30'), ('10', 'half'), ('10', 'all'),
    ('20', 'half'), ('20', 'all'),
]
for recent, reference in window_pairs:
    df_all['trend' + recent + '-' + reference + '_hard'] = (
        (df_all[score_window[recent]] / df_all[hard_window[recent]])
        / (df_all[score_window[reference]] / df_all[hard_window[reference]]))

# Rank-to-difficulty trends: (rank / difficulty) over a recent window divided
# by (rank / difficulty) over a longer window.
# Fixes a copy-paste slip: the last10-vs-last30 ratio used to overwrite
# 'rank10-20_hard'; it is now stored as 'rank10-30_hard'.
rank_window = {
    '5': 'rank_last5_mean', '10': 'rank_last10_mean', '20': 'rank_last20_mean',
    '30': 'rank_last30_mean', 'half': 'rank_lasthalf_mean', 'all': 'mean_rank',
}
hard_window = {
    '5': 'hard_last5', '10': 'hard_last10', '20': 'hard_last20',
    '30': 'hard_last30', 'half': 'hard_lasthalf', 'all': 'hard_mean',
}
window_pairs = [
    ('5', '10'), ('5', '20'), ('5', '30'), ('5', 'half'), ('5', 'all'),
    ('10', '20'), ('10', '30'), ('10', 'half'), ('10', 'all'),
    ('20', 'half'), ('20', 'all'),
]
for recent, reference in window_pairs:
    df_all['rank' + recent + '-' + reference + '_hard'] = (
        (df_all[rank_window[recent]] / df_all[hard_window[recent]])
        / (df_all[rank_window[reference]] / df_all[hard_window[reference]]))
# Score-to-rank trends: (score / rank) over a recent window divided by
# (score / rank) over a longer window.
# Fixes a copy-paste slip: the last10-vs-last30 ratio used to overwrite
# 'trend10-20_rank'; it is now stored as 'trend10-30_rank'.
score_window = {
    '5': 'last5', '10': 'last10', '20': 'last20', '30': 'last30',
    'half': 'lasthalf', 'all': 'mean_score',
}
rank_window = {
    '5': 'rank_last5_mean', '10': 'rank_last10_mean', '20': 'rank_last20_mean',
    '30': 'rank_last30_mean', 'half': 'rank_lasthalf_mean', 'all': 'mean_rank',
}
window_pairs = [
    ('5', '10'), ('5', '20'), ('5', '30'), ('5', 'half'), ('5', 'all'),
    ('10', '20'), ('10', '30'), ('10', 'half'), ('10', 'all'),
    ('20', 'half'), ('20', 'all'),
]
for recent, reference in window_pairs:
    df_all['trend' + recent + '-' + reference + '_rank'] = (
        (df_all[score_window[recent]] / df_all[rank_window[recent]])
        / (df_all[score_window[reference]] / df_all[rank_window[reference]]))

# Coefficients of variation: per course score, per student score, per course rank.
df_all['std/mean'] = df_all['std_score']/df_all['mean_score']
df_all['s_std/mean'] = df_all['s_std_score']/df_all['s_mean_score']
df_all['rank_std/mean']=df_all['std_rank']/df_all['mean_rank']
# Turn the categorical columns into integer codes. Values are cast to str
# first, so a missing value is encoded as the string 'nan'.
cat_list = [
    'cluster_50', 'cluster_40', 'cluster_30', 'cluster',
    'course', 'course_class', 'exam_id', 'student_id',
    'sim_exam_id', 'sectionsim_exam_id', 'catsim_exam_id',
]
for cat_col in cat_list:
    # A fresh encoder per column: codes are only meaningful within a column.
    lbl = LabelEncoder()
    df_all[cat_col] = lbl.fit_transform(df_all[cat_col].astype(str))

模型训练和预测

# df_all was built as train rows followed by test rows, so the first
# len(df_train) rows are the training set and the remainder is the test set.
n_train = len(df_train)
df_train = df_all[:n_train]
df_test = df_all[n_train:].reset_index(drop=True)

# Every column except the target is a model input.
col = [c for c in df_all.columns if c != 'score']
X_train = df_train[col]
y = df_train['score']
feature_name = list(X_train.columns)
sample_weight = df_train['score'] / 100
# Positions of the categorical columns within the feature matrix.
cat_col_index = [feature_name.index(cat_col) for cat_col in cat_list]

模型：Catboost

# K-fold CatBoost regression: fit one model per fold with early stopping on the
# held-out fold, record the fold RMSE, and average the test predictions.
n_splits = 3
test_y = np.zeros(len(df_test))
random_seed = 20190806
cv_model = []
cv_score = []
# NOTE(review): StratifiedKFold stratifies on y, so this assumes 'score' takes
# discrete values -- confirm, or switch to KFold for a continuous target.
skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    train_x, val_x = X_train.iloc[train_index], X_train.iloc[test_index]
    train_y, val_y = y.iloc[train_index], y.iloc[test_index]

    cbt_model = cbt.CatBoostRegressor(iterations=8000,learning_rate=0.05,max_depth=11,
                                       l2_leaf_reg=10,verbose=200,
                                       eval_metric='RMSE',task_type='CPU')
    cbt_model.fit(train_x, train_y,eval_set=(val_x,val_y),cat_features=cat_col_index,early_stopping_rounds=250)

    cv_model.append(cbt_model)
    val_y_pred = cbt_model.predict(val_x)
    cv_score.append( np.sqrt(mean_squared_error(val_y,val_y_pred)))
    # Average over folds; dividing by n_splits keeps this correct if the fold
    # count changes (it used to be a literal 3).
    test_y += cbt_model.predict(df_test[col])/n_splits
print("CV score: ",np.mean(cv_score))

# Write the averaged prediction into the submission template.
sub_test = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
sub_test['pred'] = test_y
sub_test.to_csv('./result_new.csv',index=False)

模型：LightGBM

# K-fold LightGBM regression: fit one model per fold with early stopping on the
# held-out fold, record the fold RMSE, and average the test predictions into
# test_y. Nothing in the visible lines writes test_y out.
n_splits = 5
test_y = np.zeros(len(df_test))
random_seed = 20190806
cv_model = []
cv_score = []
# The hyper-parameters are the same for every fold, so they are built once.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'min_child_samples': 46,
    'min_child_weight': 0.01,
    'subsample_freq': 1,
    'num_leaves': 40,
    'max_depth': 7,
    'subsample': 0.42,
    'colsample_bytree': 0.48,
    'reg_alpha': 0.15,
    'reg_lambda': 5,
    'verbose': -1,
    'seed': random_seed
}
# Shuffle and split the data into n_splits folds.
# NOTE(review): StratifiedKFold stratifies on y, so this assumes 'score' takes
# discrete values -- confirm, or switch to KFold for a continuous target.
skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    train_x, val_x = X_train.iloc[train_index], X_train.iloc[test_index]
    train_y, val_y = y.iloc[train_index], y.iloc[test_index]
    lgb = LGBMRegressor(**lgb_params)  # regression model
    # NOTE(review): early_stopping_rounds / verbose as fit() keyword arguments
    # were removed in LightGBM 4.0 -- confirm the installed version, or move
    # them to callbacks.
    lgb.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (val_x, val_y)],
        eval_names=['train', 'val'],
        eval_metric='rmse',
        early_stopping_rounds=100,
        categorical_feature= cat_list,
        verbose=500,
    )
    cv_model.append(lgb)
    lgb.n_estimators = lgb.best_iteration_
    val_y_pred = lgb.predict(val_x)
    cv_score.append( np.sqrt(mean_squared_error(val_y,val_y_pred)))
    # Average over folds; dividing by n_splits keeps this correct if the fold
    # count changes (it used to be a literal 5).
    test_y += lgb.predict(df_test[col])/n_splits
print("CV score: ",np.mean(cv_score))