# Model the data with different machine learning methods
# Model selection
# Cross-validation
# Cross-validation with a random forest
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold  # cross_validation was renamed model_selection in modern scikit-learn
from sklearn.metrics import confusion_matrix, log_loss
print('find best n_estimators for RandomForestClassifier')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # yields the candidates 1, 10, 100
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()
    rfc_score = 0
    rfc = RandomForestClassifier(n_estimators=n)  # n_estimators: how many trees to train
    # shuffle=True reshuffles the rows before splitting; train_k and test_k are index arrays
    print(train_kobe.columns)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        # Note: fit() fails if a feature still contains strings (e.g. 'IND') that
        # cannot be converted to float, so encode categorical columns beforehand.
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])  # iloc selects rows by integer position
        pred = rfc.predict_proba(train_kobe.iloc[test_k])  # log_loss expects probabilities, not hard labels
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f} sec)'.format(n, t2 - t1))
print(best_n, min_score)
print('find best max_depth for RandomForestClassifier')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # candidate depths: 1, 10, 100
for m in range_m:
    print('the max depth: {0}'.format(m))
    t1 = time.time()
    rfc_score = 0
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)  # max_depth: how deep each tree may grow
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing depth {0} ({1:.3f} sec)'.format(m, t2 - t1))
print(best_m, min_score)
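# Both searches record the fold-averaged log loss in scores_n and scores_m but
# never display it. A minimal matplotlib sketch (assuming the two loops above
# have run) makes the trends visible:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(range_n, scores_n)
ax1.set_xscale('log')  # the candidates come from np.logspace
ax1.set_xlabel('n_estimators')
ax1.set_ylabel('CV log loss')
ax2.plot(range_m, scores_m)
ax2.set_xscale('log')
ax2.set_xlabel('max_depth')
plt.show()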
# Model parameter tuning
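# The two loops above amount to a coordinate-wise grid search done by hand;
# scikit-learn's GridSearchCV expresses the same idea in a few lines. A sketch,
# assuming the same train_kobe / train_label frames as above:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [1, 10, 100], 'max_depth': [1, 10, 100]}
# neg_log_loss mirrors the manual log_loss scoring (negated so higher is better)
grid = GridSearchCV(RandomForestClassifier(), param_grid,
                    scoring='neg_log_loss', cv=10)
grid.fit(train_kobe, train_label)
print(grid.best_params_, -grid.best_score_)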
# ******************** Result analysis and improvement ******************** #
# Performance measures for classification: precision, recall, confusion matrix.
# Plan: binary classification with logistic regression, then a confusion matrix
# computed on the processed (undersampled) dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
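# Quick reference for these measures on toy labels (a minimal sketch, not tied
# to the dataset):
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print(confusion_matrix(y_true, y_pred))       # rows = true class, columns = predicted class
print(recall_score(y_true, y_pred))           # TP / (TP + FN) = 2/3
print(classification_report(y_true, y_pred))  # precision / recall / F1 per class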
def printing_Kfold_scores(X_train_data, y_train_data):
    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:
        print('------------------')
        print('C parameter:', c_param)
        print('------------------')
        print('')
        recall_accs = []
        # enumerate(..., start=1) numbers the folds from 1; indices is the
        # (train_indices, test_indices) pair for the current fold
        for iteration, indices in enumerate(fold.split(X_train_data), start=1):
            # C is the inverse of the regularization strength (a positive float,
            # default 1.0); as with SVMs, smaller values mean stronger regularization
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # ravel() flattens the (n, 1) label column to 1-D, like flatten()
            lr.fit(X_train_data.iloc[indices[0], :],
                   y_train_data.iloc[indices[0], :].values.ravel())
            y_pred_undersample = lr.predict(X_train_data.iloc[indices[1], :].values)
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, 'recall score =', recall_acc)
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score', np.mean(recall_accs))
        print('')
    # the column holds Python objects, so cast to float before idxmax()
    results_table['Mean recall score'] = results_table['Mean recall score'].astype('float64')
    best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
    print('******************************************************************************')
    print('Best model to choose from cross validation is with C parameter =', best_c)
    print('******************************************************************************')
    return best_c
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)
# Compute the confusion matrix on the processed (undersampled) dataset
# import itertools
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# X_test_undersample / y_test_undersample assumed from the earlier train/test
# split of the undersampled data
y_pred_undersample = lr.predict(X_test_undersample.values)
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
print(cnf_matrix)
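# The commented-out itertools import above is typically used to annotate a
# plotted confusion matrix; a minimal sketch, assuming cnf_matrix from the
# code above:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    # Render the matrix as a colored grid with the count written in each cell
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.0
    # itertools.product walks every (row, column) cell of the matrix
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()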