Machine Learning Workflow

The goal of this article is to demonstrate how to build a reasonably complete machine learning workflow.

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import sys
import time
import timeit
import warnings
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
from scipy.fftpack import fft
from scipy.signal import medfilt, stft, butter, lfilter
from scipy.stats import entropy, pearsonr

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython import display
from IPython.display import Audio

# Audio processing
import wave
import librosa
from librosa import display as librosadisplay
from pydub import AudioSegment
from pydub.utils import mediainfo
from pyAudioAnalysis import audioBasicIO, ShortTermFeatures, MidTermFeatures

# Modeling
from sklearn import (svm, tree, linear_model, neighbors, naive_bayes, ensemble,
                     discriminant_analysis, gaussian_process, feature_selection,
                     model_selection, metrics)
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             explained_variance_score, f1_score, make_scorer,
                             mean_absolute_error, mean_squared_error, r2_score,
                             roc_auc_score, roc_curve, auc)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold, cross_val_predict,
                                     cross_val_score, train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, scale, normalize
from sklearn.svm import SVR

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

sr = 32000
warnings.simplefilter("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

1. First, set up the basic configuration for the project

# Basic modeling configuration
SCORE_EVA = 'roc_auc'
random_state_clf = 1
n_jobs= 16
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_split2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
X, y = data_of_features, label   # feature table (DataFrame) and labels prepared upstream
X = StandardScaler().fit_transform(X)   # note: scaling converts X to a NumPy array
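The configuration above assumes that a feature table data_of_features, a label vector label, and a list of candidate feature names (features / features43, referenced further below) have already been prepared upstream. A minimal, purely illustrative sketch of that preparation, assuming a hypothetical CSV file with a 'target' column:

# Hypothetical data preparation (illustrative only; replace the file name and column names with your own)
df = pd.read_csv('features_table.csv')
label = df['target']                              # binary labels
data_of_features = df.drop(columns=['target'])    # feature matrix (DataFrame)
features = list(data_of_features.columns)         # candidate feature names
features43 = features                             # alias used later when printing selected features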

2. Evaluate each ML model with its default parameters to gauge its baseline capability on the current task, and shortlist promising models for hyperparameter tuning

#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(random_state=random_state_clf),
    ensemble.BaggingClassifier(random_state=random_state_clf),
    ensemble.ExtraTreesClassifier(random_state=random_state_clf),
    ensemble.GradientBoostingClassifier(random_state=random_state_clf),
    ensemble.RandomForestClassifier(random_state=random_state_clf),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(random_state=random_state_clf),
    
    #GLM
    linear_model.LogisticRegressionCV(random_state=random_state_clf),
    linear_model.PassiveAggressiveClassifier(random_state=random_state_clf),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=random_state_clf),
    linear_model.Perceptron(random_state=random_state_clf),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True,random_state=random_state_clf),
    svm.NuSVC(probability=True,random_state=random_state_clf),
    svm.LinearSVC(random_state=random_state_clf),
    
    #Trees    
    tree.DecisionTreeClassifier(random_state=random_state_clf),
    tree.ExtraTreeClassifier(random_state=random_state_clf),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    XGBClassifier(random_state=random_state_clf)    
    ]
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Metric Mean', 'MLA Test Metric Mean', 'MLA Test Metric 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
for alg in MLA:

    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, X, y, cv  = cv_split,return_train_score=True,scoring = SCORE_EVA)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Metric Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Metric Mean'] = cv_results['test_score'].mean()
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Metric 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    
    row_index+=1

MLA_compare = MLA_compare.sort_values(by='MLA Test Metric Mean', ascending=False)  # sort in descending order
MLA_compare
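To make the shortlisting easier, the comparison table can also be plotted; a minimal sketch using the already-imported seaborn:

# Visualize the cross-validated test metric per algorithm (optional)
plot_df = MLA_compare[['MLA Name', 'MLA Test Metric Mean']].copy()
plot_df['MLA Test Metric Mean'] = plot_df['MLA Test Metric Mean'].astype(float)
plt.figure(figsize=(10, 6))
sns.barplot(x='MLA Test Metric Mean', y='MLA Name', data=plot_df, color='steelblue')
plt.title('Cross-validated {} by algorithm'.format(SCORE_EVA))
plt.xlabel('Mean {} (5-fold CV)'.format(SCORE_EVA))
plt.tight_layout()
plt.show()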

3. Take the better-performing models and, combining cross-validation with recursive feature elimination (RFE), perform hyperparameter tuning and feature selection jointly

#This function runs a cross-validated search that combines feature selection (RFE or SelectKBest) with Bayesian or grid hyperparameter search
def SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_split,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe', bayes_n_iter=10,verbose = 0,n_jobs=1):
    if feature_method == 'rfe':
        pipe = Pipeline( [('scaler', StandardScaler()),('feature_selector', RFE(estimator=clf_model)),('model', clf_model)])
    else:
        pipe = Pipeline( [('scaler', StandardScaler()),('feature_selector', SelectKBest(f_classif)),  ('model', clf_model)])
        
    if Search_method =='grid':
        grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_split, verbose=verbose,scoring = SCORE_EVA,n_jobs=n_jobs)
    else:
        grid_search = BayesSearchCV(pipe, search_spaces=param_grid, verbose=verbose, scoring=SCORE_EVA, cv=cv_split, n_iter=bayes_n_iter,n_jobs=n_jobs)
    grid_search.fit(X, y)
    return grid_search
#This function finds the best feature subset and hyperparameters via repeated multi-fold cross-validation; the features and parameter values that appear
#most often across the repeats are taken as the final choice. Note that with Bayesian search the hyperparameters found in each repeat may all be unique, so every value may occur only once.

def multi_times_SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_outer=10,cv_inner=5,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe',bayes_n_iter=2,verbose=0,n_jobs=1):
    start_time = timeit.default_timer()
    inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=1)
    if cv_outer==1:
        grid_search_result = SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,inner_cv,SCORE_EVA,Search_method,feature_method,bayes_n_iter,verbose,n_jobs)
        end_time = timeit.default_timer()
        print(f"函数运行时间为 {(end_time - start_time)/60} 分")
        print("Best score found: ", grid_search_result.best_score_)
        print("Best parameters found: ", grid_search_result.best_params_)
        print("Selected features:", np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_])
        return grid_search_result
    else:      
        outer_cv = StratifiedKFold(n_splits=cv_outer, shuffle=True, random_state=0)
        
        roc =[]
        best_params_history = [] 
        selected_features_history = [] 
        # run hyperparameter optimization on each outer fold
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            search = SearchCV_Feature_and_Parameter(X_train,y_train,clf_model,param_grid,inner_cv,SCORE_EVA,Search_method,feature_method,bayes_n_iter,verbose,n_jobs)
            best_params_history.append(search.best_params_)
            best_model = search.best_estimator_

            selected_features = best_model.named_steps['feature_selector'].get_support()
            selected_features_history.append(selected_features)

            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:,1]
            roc.append(roc_auc_score(y_test, y_pred_proba))

        best_params_history_df = pd.DataFrame([dict(ordered_dict) for ordered_dict in best_params_history])
        best_params_history_df[SCORE_EVA] = roc

        print(f"{cv_split}次{cv_split}折交叉验证平均ROC: {np.mean(roc):.4f}  std: {np.std(roc):.4f} ,{[round(meta,3) for meta in roc]}")
        #for i, selected_features in enumerate(selected_features_history, start=1):
        #    print(f"第{i}次交叉验证所选择的特征: {np.array(features)[selected_features]}")
        param_names = best_params_history[0].keys()
        overall_best_params = {}
        for param_name in param_names:
            value_counts = Counter([params[param_name] for params in best_params_history])
            most_common_value = value_counts.most_common(1)[0][0]
            overall_best_params[param_name] = most_common_value
        print("整体最佳超参数: ", overall_best_params)
        # 多模型集成+计算整体最佳作为最终超参数
        total_features = X.shape[1]
        feature_selection_counts = np.zeros(total_features)
        # count how many times each feature was selected
        for selected_features in selected_features_history:
            feature_selection_counts += selected_features.astype(int)
        # threshold: a feature is kept if it was selected in more than half of the folds
        threshold = len(selected_features_history) // 2
        overall_best_features = feature_selection_counts > threshold
        print("整体最佳特征集: ", np.array(features)[overall_best_features])
        end_time = timeit.default_timer()
        print(f"函数运行时间为 {(end_time - start_time)/60} 分")
        return best_params_history_df,selected_features_history,roc
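# Usage sketch for the repeated (nested-CV) mode, cv_outer > 1 (illustrative only): note that this
# branch indexes X and y with .iloc, so it expects pandas objects rather than NumPy arrays.
# params_df, feats_hist, rocs = multi_times_SearchCV_Feature_and_Parameter(
#     data_of_features, label, clf_model, param_grid, cv_outer=10, cv_inner=5,
#     SCORE_EVA='roc_auc', Search_method='Bayes', feature_method='rfe',
#     bayes_n_iter=10, n_jobs=n_jobs)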
def model_evaluate(X,y,model,n_times,test_size=0.3):
    scores = []
    # repeat random train/test splits n_times
    for i in range(n_times):
        # split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,random_state=i)
        # train and evaluate; each call to fit re-initializes the model
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:,1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)
    return scores
# Build the ML workflow: apply RFE feature selection and hyperparameter tuning to the better-performing model, which serves as the core model for this task
grid_n_estimator = Integer(1, 300)
grid_ratio = Real(0.01, 1.0, 'log-uniform')
grid_learn = Real(0.01, 1.0, 'log-uniform')
grid_max_depth =  Integer(1, 15)
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# define the hyperparameter search space
param_grid = {
    #'feature_selector__k': Integer(5, 15),
    'feature_selector__n_features_to_select': Integer(5, 15),
    'model__learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'model__max_depth': Integer(1, 50),
    'model__n_estimators': Integer(50, 200),
    'model__random_state': grid_seed
}

clf_model = XGBClassifier(scale_pos_weight=2,objective='binary:logistic',seed=0)
grid_search_result = multi_times_SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_outer=1,cv_inner=5,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe',bayes_n_iter=10,verbose=0,n_jobs=n_jobs)
# With the optimal feature subset and hyperparameters found, evaluate overall and generalization performance via repeated random splits; the std of the scores reflects generalization stability
X_best = X[:, grid_search_result.best_estimator_.named_steps['feature_selector'].support_]  # X is a NumPy array after scaling, so select columns with the boolean support mask
X_best = StandardScaler().fit_transform(X_best)
clf_model.set_params(**{k.replace("model__", ""): v for k, v in grid_search_result.best_params_.items() if k.startswith("model__")})
scores = model_evaluate(X_best,y,clf_model,n_times=100,test_size=0.3)
mean_score = round(np.mean(scores),3)
std_score = round(np.std(scores),3)
print('Best model:',mean_score,std_score)
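Beyond the mean/std summary, it can be informative to look at the ROC curve of the tuned model on one held-out split; a minimal sketch using the roc_curve and auc utilities imported above (a single random split, for illustration only):

# Plot the ROC curve of the tuned model on one held-out split
X_tr, X_te, y_tr, y_te = train_test_split(X_best, y, test_size=0.3, random_state=0)
clf_model.fit(X_tr, y_tr)
proba = clf_model.predict_proba(X_te)[:, 1]
fpr, tpr, _ = roc_curve(y_te, proba)
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()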

4. After feature selection, tune the hyperparameters of the candidate models on the selected feature subset using Bayesian cross-validated search

#Multi-model ensemble on the optimal feature subset
#why choose one model, when you can pick them all with voting classifier
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
#removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
vote_est = [
    #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    
    ('xgb', XGBClassifier())
    #('bc', ensemble.BaggingClassifier()),
    #('etc',ensemble.ExtraTreesClassifier()),
    #('gbc', ensemble.GradientBoostingClassifier()),
    #('rfc', ensemble.RandomForestClassifier()),

    #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    #('gpc', gaussian_process.GaussianProcessClassifier()),
    
    #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    #('lr', linear_model.LogisticRegressionCV()),
    
    #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    #('bnb', naive_bayes.BernoulliNB()),
    #('gnb', naive_bayes.GaussianNB()),
    
    #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    #('knn', neighbors.KNeighborsClassifier()),
    
    #SVM: http://scikit-learn.org/stable/modules/svm.html
   # ('svc', svm.SVC(probability=True)),
    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
   #('xgb', XGBClassifier())

]


#WARNING: Running is very computational intensive and time expensive.
#Code is written for experimental/developmental purposes and not production ready!
#Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_param = [
            [{
            #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator, #default=50
            'learning_rate': grid_learn, #default=1
            #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R
            'random_state': grid_seed
            }],
       
            [{
            #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators': grid_n_estimator, #default=10
            'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth, #default=None
            'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
            'random_state': grid_seed
             }],
    
             [{
            #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
            #'loss': ['deviance', 'exponential'], #default=’deviance’
            'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
            'max_depth': grid_max_depth, #default=3   
            'random_state': grid_seed
             }],
    
            [{
            #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': grid_learn, #default: .3
            'max_depth': [1,2,4,6,8,10], #default 2
            'n_estimators': grid_n_estimator, 
            'seed': grid_seed  
             }] ,
    '''
            [{
            #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators': grid_n_estimator, #default=10
            'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth, #default=None
            'random_state': grid_seed
             }],
             
             
            [{
            #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
            'n_estimators': grid_n_estimator, #default=10
            'max_samples': grid_ratio, #default=1.0
            'random_state': grid_seed
             }],
             

    
            [{    
            #GaussianProcessClassifier
            'max_iter_predict': grid_n_estimator, #default: 100
            'random_state': grid_seed
            }],
        
    
            [{
            #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
            'fit_intercept': grid_bool, #default: True
            #'penalty': ['l1','l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
            'random_state': grid_seed
             }],
            
    
            [{
            #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
            'alpha': grid_ratio, #default: 1.0
             }],
    
    
            #GaussianNB - 
            [{}],
    
            [{
            #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
            'n_neighbors': [1,2,3,4,5,6,7], #default: 5
            'weights': ['uniform', 'distance'], #default = ‘uniform’
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
            }],
            
    
            [{
            #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
            #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
            #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [1,2,3,4,5], #default=1.0
            'gamma': grid_ratio, #edfault: auto
            'decision_function_shape': ['ovo', 'ovr'], #default:ovr
            'probability': [True],
            'random_state': grid_seed
             }],
            '''
           
        ]

start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip (vote_est, grid_param): #https://docs.python.org/3/library/functions.html#zip

    #print(clf[0],clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
    #print(param)
    
    start = time.perf_counter()        
    #best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = SCORE_EVA)
    best_search = BayesSearchCV(clf[1], search_spaces=param, scoring=SCORE_EVA, cv=cv_split, n_iter=50,n_jobs=16)
    best_search.fit(X_best, y)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    clf[1].set_params(**best_param) 
    #evaluate the tuned model over multiple random splits to assess generalization
    scores = model_evaluate(X_best, y,clf[1],10,test_size=0.3)
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds, scoring: {:.3f}, std: {:.3f}'.format(clf[1].__class__.__name__, best_param, run, np.mean(scores), np.std(scores)))

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
print('-'*10)

5. After tuning each model's hyperparameters, combine the models via ensembling (voting) or stacking

#Multi-model ensemble via voting
#Soft Vote or weighted probabilities w/Tuned Hyperparameters
vote = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')  #voting = 'hard'
vote_cv = model_selection.cross_validate(vote, X_best, y, cv  = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_cv['test_score'].std()*100*3))
print('-'*10)

# Stacking
meta_learner = linear_model.LogisticRegression()
stacking_model = StackingClassifier(estimators=vote_est, final_estimator=meta_learner)
stacking_cv = model_selection.cross_validate(stacking_model, X_best, y, cv  = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
print("Stacking Training w/bin score mean: {:.2f}". format(stacking_cv['train_score'].mean()*100)) 
print("Stacking Test w/bin score mean: {:.2f}". format(stacking_cv['test_score'].mean()*100))
print("Stacking Test w/bin score 3*std: +/- {:.2f}". format(stacking_cv['test_score'].std()*100*3))
print('-'*10)
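If the stacked ensemble is the model to keep, it can be refit on all of the selected features and persisted for later use; a minimal sketch using joblib (the file name is arbitrary):

import joblib
final_model = stacking_model.fit(X_best, y)          # refit the stacked ensemble on all available data
joblib.dump(final_model, 'stacking_model.joblib')    # hypothetical output path
# later: final_model = joblib.load('stacking_model.joblib')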

6. Hyperparameter tuning with the hyperopt library

space_multi = hp.choice('classifier_type', [
    {
        'type': 'AdaBoostClassifier',
        'params': {
            'learning_rate': hp.uniform('learning_rate_AdaBoost', 0.01, 2.0),
            'n_estimators': hp.choice('n_estimators_AdaBoost', range(1, 20)),
            'random_state': hp.choice('random_state_AdaBoost', [1])
        }
    },
    {
        'type': 'RandomForestClassifier',
        'params': {
            'n_estimators': hp.choice('n_estimators_RF', range(1, 20)),
            'criterion': hp.choice('criterion_RF', ["gini", "entropy"]),
            'max_depth': hp.choice('max_depth_RF', range(1, 20)),
            'random_state': hp.choice('random_state_RF', [1])
        }
    },
    {
        'type': 'GradientBoostingClassifier',
        'params': {
            'learning_rate': hp.uniform('learning_rate_GBC', 0.01, 2.0),
            'n_estimators': hp.choice('n_estimators_GBC', range(1, 20)),
            'criterion': hp.choice('criterion_GBC', ["friedman_mse"]),
            'max_depth': hp.choice('max_depth_GBC', range(1, 20)),
            'random_state': hp.choice('random_state_GBC', [1])
        }
    },
    {
        'type': 'XGBClassifier',
        'params': {
            'learning_rate': hp.uniform('learning_rate_XGB', 0.01, 2.0),
            'n_estimators': hp.choice('n_estimators_XGB', range(1, 20)),
            'random_state': hp.choice('random_state_XGB', [1])
        }
    }
])

def hyperopt_train_test(params):
    t = params['type']
    params = params['params']
    if t == 'AdaBoostClassifier':
        clf = ensemble.AdaBoostClassifier(**params)
    elif t == 'RandomForestClassifier':
        clf = ensemble.RandomForestClassifier(**params)
    elif t == 'GradientBoostingClassifier':
        clf = ensemble.GradientBoostingClassifier(**params)
    elif t == 'XGBClassifier':
        clf = XGBClassifier(**params)
    else:
        return 0
    
    return cross_val_score(clf, X_best, y, scoring=SCORE_EVA, cv=cv_split, n_jobs=n_jobs).mean()


count = 0
best = 0
def f(params):
    global best, count
    count += 1
    acc = hyperopt_train_test(params.copy())
    if acc > best:
        print("new best:{:.3f}, using {}".format(acc, params['type'])) 
        best = acc
    #if count % 50 == 0:
    #print("iters:{:.3f}, score: {:.3f} using {}".format(count, acc, params) )
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space_multi, algo=tpe.suggest, max_evals=1500, trials=trials)
print('best: ',best)

"""
new best:0.823, using AdaBoostClassifier                
new best:0.961, using AdaBoostClassifier                                           
new best:0.962, using XGBClassifier                                                
new best:0.973, using XGBClassifier                                                
new best:0.975, using XGBClassifier                                                 
new best:0.977, using XGBClassifier                                                 
new best:0.978, using XGBClassifier                                                  
100%|██████████| 1500/1500 [01:41<00:00, 14.73trial/s, best loss: -0.9783106134371957]
best:  {'classifier_type': 3, 'learning_rate_XGB': 0.47638758660359, 'n_estimators_XGB': 6, 'random_state_XGB': 0}
"""
param_ranges = {
    'n_estimators': list(range(1, 50)),
    'random_state': [1],
    'max_depth': list(range(1, 50)),
    'min_child_weight': list(range(1, 2)),
}

# Initialize the space dict with the learning_rate entry; parameters defined via hp.uniform already return actual values, so no index-to-value mapping is needed for them later (unlike hp.choice)
space = {'learning_rate': hp.uniform('learning_rate', 0.01, 1.0)}
# add the remaining parameters to the space dict in a loop
for k, values in param_ranges.items():
    space[k] = hp.choice(k, values)

def f(params):
    X_ = X_best[:]
    if 'normalize' in params:
        if params['normalize'] == 1:
            X_ = normalize(X_)
        del params['normalize']

    if 'scale' in params:
        if params['scale'] == 1:
            X_ = scale(X_)
        del params['scale']
        
    clf = XGBClassifier(**params)
    
    score = cross_val_score(clf, X_, y, scoring=SCORE_EVA, cv=cv_split, n_jobs=n_jobs).mean()  # use X_ so the optional normalize/scale steps take effect
    
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=100, trials=trials)
# convert hp.choice indices back to actual values
for key in best:
    if key in param_ranges:  # only convert parameters defined with hp.choice
        best[key] = param_ranges[key][best[key]]

print('best: ',best)
f, axes = plt.subplots(nrows=int(len(space.keys())/3+0.5), ncols=3, figsize=(15,5*int(len(space.keys())/3+0.5)))
cmap = plt.cm.jet
for i, val in enumerate(space.keys()):
    xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()
    ys = [-t['result']['loss'] for t in trials.trials]
    xs, ys = zip(*sorted(zip(xs, ys)))
    ys = np.array(ys)
    if int(len(space.keys())/3+0.5)==1:
        axes[int(i%3)].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(space.keys())))
        axes[int(i%3)].set_title(val)
    else:
        axes[int(i/3),int(i%3)].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(space.keys())))
        axes[int(i/3),int(i%3)].set_title(val)
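Finally, the hyperopt-selected parameters can be plugged back into the classifier and the generalization check repeated with model_evaluate; a minimal sketch (assumes best has already been mapped back to actual values as above):

# Refit XGBoost with the hyperopt-selected parameters and re-check generalization (illustrative)
clf_hopt = XGBClassifier(**best)
scores_hopt = model_evaluate(X_best, y, clf_hopt, n_times=100, test_size=0.3)
print('hyperopt-tuned model:', round(np.mean(scores_hopt), 3), round(np.std(scores_hopt), 3))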

7. Reference: A Data Science Framework: To Achieve 99% Accuracy | Kaggle
