# Model the data with different machine learning methods
# Model selection
# Cross-validation
# Cross-validation with a random forest
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold  # cross_validation was renamed model_selection in modern scikit-learn
from sklearn.metrics import confusion_matrix, log_loss
print('find best n_estimators for RandomForestClassifier')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # yields the candidates 1, 10, 100
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()
    rfc_score = 0
    rfc = RandomForestClassifier(n_estimators=n)  # n_estimators: how many trees to train
    # shuffle=True reshuffles the rows before splitting; train_k and test_k are index arrays
    print(train_kobe.columns)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        # Note: fit() fails if a feature still contains strings (e.g. 'IND') that
        # cannot be converted to float, so encode categorical columns beforehand.
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])  # iloc selects rows by integer position
        pred = rfc.predict_proba(train_kobe.iloc[test_k])  # log_loss expects probabilities, not hard labels
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f} sec)'.format(n, t2 - t1))
print(best_n, min_score)
print('find best max_depth for RandomForestClassifier')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # candidate depths: 1, 10, 100
for m in range_m:
    print('the max depth: {0}'.format(m))
    t1 = time.time()
    rfc_score = 0
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)  # max_depth: how deep each tree may grow
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing depth {0} ({1:.3f} sec)'.format(m, t2 - t1))
print(best_m, min_score)
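# Both searches record the fold-averaged log loss in scores_n and scores_m but
# never display it. A minimal matplotlib sketch (assuming the two loops above
# have run) makes the trends visible:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(range_n, scores_n)
ax1.set_xscale('log')  # the candidates come from np.logspace
ax1.set_xlabel('n_estimators')
ax1.set_ylabel('CV log loss')
ax2.plot(range_m, scores_m)
ax2.set_xscale('log')
ax2.set_xlabel('max_depth')
plt.show()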
# Model parameter tuning
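# The two loops above amount to a coordinate-wise grid search done by hand;
# scikit-learn's GridSearchCV expresses the same idea in a few lines. A sketch,
# assuming the same train_kobe / train_label frames as above:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [1, 10, 100], 'max_depth': [1, 10, 100]}
# neg_log_loss mirrors the manual log_loss scoring (negated so higher is better)
grid = GridSearchCV(RandomForestClassifier(), param_grid,
                    scoring='neg_log_loss', cv=10)
grid.fit(train_kobe, train_label)
print(grid.best_params_, -grid.best_score_)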
# ******************** Result analysis and improvement ******************** #
# Performance measures for classification: precision, recall, confusion matrix.
# Plan: binary classification with logistic regression, then a confusion matrix
# computed on the processed (undersampled) dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
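# Quick reference for these measures on toy labels (a minimal sketch, not tied
# to the dataset):
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print(confusion_matrix(y_true, y_pred))       # rows = true class, columns = predicted class
print(recall_score(y_true, y_pred))           # TP / (TP + FN) = 2/3
print(classification_report(y_true, y_pred))  # precision / recall / F1 per class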
def printing_Kfold_scores(X_train_data, y_train_data):
    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:
        print('------------------')
        print('C parameter:', c_param)
        print('------------------')
        print('')
        recall_accs = []
        # enumerate(..., start=1) numbers the folds from 1; indices is the
        # (train_indices, test_indices) pair for the current fold
        for iteration, indices in enumerate(fold.split(X_train_data), start=1):
            # C is the inverse of the regularization strength (a positive float,
            # default 1.0); as with SVMs, smaller values mean stronger regularization
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # ravel() flattens the (n, 1) label column to 1-D, like flatten()
            lr.fit(X_train_data.iloc[indices[0], :],
                   y_train_data.iloc[indices[0], :].values.ravel())
            y_pred_undersample = lr.predict(X_train_data.iloc[indices[1], :].values)
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, 'recall score =', recall_acc)
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score', np.mean(recall_accs))
        print('')
    # the column holds Python objects, so cast to float before idxmax()
    results_table['Mean recall score'] = results_table['Mean recall score'].astype('float64')
    best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
    print('******************************************************************************')
    print('Best model to choose from cross validation is with C parameter =', best_c)
    print('******************************************************************************')
    return best_c
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)
# Compute the confusion matrix on the processed (undersampled) dataset
# import itertools
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# X_test_undersample / y_test_undersample assumed from the earlier train/test
# split of the undersampled data
y_pred_undersample = lr.predict(X_test_undersample.values)
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
print(cnf_matrix)
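# The commented-out itertools import above is typically used to annotate a
# plotted confusion matrix; a minimal sketch, assuming cnf_matrix from the
# code above:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    # Render the matrix as a colored grid with the count written in each cell
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.0
    # itertools.product walks every (row, column) cell of the matrix
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()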