Kaggle - Titanic Survival Prediction

Model Data

#imports assumed from earlier in the full notebook
import pandas as pd
from sklearn import ensemble, gaussian_process, linear_model, naive_bayes, neighbors, svm, tree, discriminant_analysis
from sklearn import model_selection, feature_selection, metrics
from xgboost import XGBClassifier

#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),       # adaptive boosting (AdaBoost)
    ensemble.BaggingClassifier(),        # bagging (bootstrap aggregating)
    ensemble.ExtraTreesClassifier(),     # extremely randomized trees
    ensemble.GradientBoostingClassifier(), # gradient boosted decision trees (GBDT)
    ensemble.RandomForestClassifier(),   # random forest


    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),  # Bayesian-inference classifier
    
    #GLM -- generalized linear models
    linear_model.LogisticRegressionCV(),    # logistic regression with built-in cross-validation
    linear_model.PassiveAggressiveClassifier(),  # passive-aggressive classifier (for online learning)
    linear_model.RidgeClassifierCV(),       # ridge classifier with built-in cross-validation
    linear_model.SGDClassifier(),           # stochastic gradient descent (SGD)
    linear_model.Perceptron(),              # perceptron

    
    #Naive Bayes
    naive_bayes.BernoulliNB(),  # Bernoulli naive Bayes (for binary features)
    naive_bayes.GaussianNB(),   # Gaussian naive Bayes (for continuous features)
    

    #Nearest Neighbors
    neighbors.KNeighborsClassifier(), # k-nearest neighbors (KNN)
    
    #SVM -- support vector machines
    svm.SVC(probability=True),  # support vector classifier (with probability output)
    svm.NuSVC(probability=True),  # support vector classifier regularized by nu instead of C
    svm.LinearSVC(),  # linear SVM (suited to large datasets)

    
    #Trees -- decision trees
    tree.DecisionTreeClassifier(),  # classic decision tree
    tree.ExtraTreeClassifier(),     # extremely randomized tree

    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),  # linear discriminant analysis (LDA)
    discriminant_analysis.QuadraticDiscriminantAnalysis(), # quadratic discriminant analysis (QDA)


    
    #XGBoost -- extreme gradient boosting
    XGBClassifier() # XGBoost classifier
    ]



#split dataset in cross-validation with this splitter class
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
MLA_predict = data1[Target]

#index through MLA and save performance to table
row_index = 0
for alg in MLA:

    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, data1[data1_x_bin], data1[Target], cv = cv_split, return_train_score = True) #return_train_score is needed for the train accuracy column below

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #for an unbiased random sample, +/- 3 standard deviations (std) from the mean statistically captures 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #worst-case spread of the test accuracy
    

    #save MLA predictions - see section 6 for usage
    alg.fit(data1[data1_x_bin], data1[Target])
    MLA_predict[MLA_name] = alg.predict(data1[data1_x_bin])
    
    row_index+=1

    
#print and sort table
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

The loop above uses cross-validation to compare the performance of the machine learning algorithms (MLA) on the dataset and stores the results in a table for later analysis and for selecting the best model.
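
To make the comparison easier to scan, the test-accuracy column of MLA_compare can also be drawn as a bar chart. A minimal sketch, assuming matplotlib and seaborn are available:

import matplotlib.pyplot as plt
import seaborn as sns

#bar plot of the mean cross-validated test accuracy per algorithm, using the sorted comparison table
sns.barplot(x='MLA Test Accuracy Mean', y='MLA Name', data=MLA_compare, color='m')
plt.title('Machine Learning Algorithm Accuracy Score')
plt.xlabel('Accuracy Score')
plt.ylabel('Algorithm')
plt.show()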

Notes:

  • AdaBoostClassifier() combines multiple weak learners (e.g., shallow decision trees), reweighting samples at each round to improve accuracy.
  • BaggingClassifier() builds multiple training sets via bootstrap sampling and aggregates the resulting models.
  • ExtraTreesClassifier() is similar to a random forest but injects extra randomness into the splits.
  • GradientBoostingClassifier() improves performance by successively correcting the errors of the previous model; well suited to small datasets.
  • RandomForestClassifier() is an ensemble of decision trees that reduces overfitting.
  • GaussianProcessClassifier() classifies via Bayesian inference; suited to small datasets but computationally expensive.
  • LogisticRegressionCV() is logistic regression that automatically selects the best regularization strength.
  • PassiveAggressiveClassifier() is designed for online learning and scales to large datasets.
  • RidgeClassifierCV() uses ridge regularization, which mitigates multicollinearity.
  • SGDClassifier() scales to large datasets by optimizing with stochastic gradient descent.
  • Perceptron() is a single-layer neural network (perceptron) that can only learn linearly separable data.
  • BernoulliNB() is suited to binary features (e.g., bag-of-words text classification).
  • GaussianNB() assumes features follow a Gaussian distribution; suited to continuous variables.
  • KNeighborsClassifier() classifies by distance to the training samples; suited to small datasets.
  • SVC() handles non-linear classification and supports kernels such as kernel='rbf' and kernel='poly'.
  • NuSVC() is similar to SVC() but uses the hyperparameter nu instead of C to control regularization.
  • LinearSVC() scales to large datasets but only supports linear decision boundaries.
  • DecisionTreeClassifier() is the basic decision tree; prone to overfitting.
  • ExtraTreeClassifier() adds randomness to reduce overfitting.
  • LinearDiscriminantAnalysis() works well when each class is approximately normally distributed.
  • QuadraticDiscriminantAnalysis() allows each class its own covariance matrix, making it more flexible.
  • XGBClassifier() is a gradient boosted tree (GBDT) implementation that trains faster than GradientBoostingClassifier() and scales to large datasets.

Evaluate Model Performance

Below is a handmade decision tree for Titanic survival prediction, built without a machine learning algorithm.

#handmade data model using brain power (and Microsoft Excel Pivot Tables for quick calculations)
def mytree(df):
    
    #initialize table to store predictions
    Model = pd.DataFrame(data = {'Predict':[]})
    male_title = ['Master'] #survived titles

    for index, row in df.iterrows():

        #Question 1: Were you on the Titanic; majority died
        Model.loc[index, 'Predict'] = 0

        #Question 2: Are you female; majority survived
        if (df.loc[index, 'Sex'] == 'female'):
                  Model.loc[index, 'Predict'] = 1

        #Question 3A Female - Class and Question 4 Embarked gain minimum information

        #Question 5B Female - FareBin; set anything less than .5 in female node decision tree back to 0       
        if ((df.loc[index, 'Sex'] == 'female') & 
            (df.loc[index, 'Pclass'] == 3) & 
            (df.loc[index, 'Embarked'] == 'S')  &
            (df.loc[index, 'Fare'] > 8)

           ):
                  Model.loc[index, 'Predict'] = 0

        #Question 3B Male: Title; set anything greater than .5 to 1 for majority survived
        if ((df.loc[index, 'Sex'] == 'male') &
            (df.loc[index, 'Title'] in male_title)
            ):
            Model.loc[index, 'Predict'] = 1
        
        
    return Model


#model data
Tree_Predict = mytree(data1)
print('Handmade Decision Tree Model Accuracy Score: {:.2f}%\n'.format(metrics.accuracy_score(data1['Survived'], Tree_Predict)*100))


print(metrics.classification_report(data1['Survived'], Tree_Predict))

The handmade tree above reaches an accuracy of 82.04%; any machine learning algorithm scoring below this baseline is performing poorly.
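
To see where the handmade tree errs, the classification report can be complemented with a confusion matrix. A minimal sketch using sklearn.metrics:

#confusion matrix for the handmade tree: rows = actual class, columns = predicted class
cm = metrics.confusion_matrix(data1['Survived'], Tree_Predict)
print(pd.DataFrame(cm, index=['Actual Died', 'Actual Survived'],
                   columns=['Predicted Died', 'Predicted Survived']))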

1. Model Performance with Cross-Validation

Note: build the model on different subsets of the training data and evaluate it on held-out test data; otherwise the model will overfit.
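
As a baseline for the tuning steps below, the full notebook cross-validates a plain decision tree with the same cv_split. A minimal sketch (it also defines and fits the dtree object that the RFECV and Graphviz steps later reuse):

#base decision tree, evaluated with the same ShuffleSplit used for the MLA comparison
dtree = tree.DecisionTreeClassifier(random_state=0)
base_results = model_selection.cross_validate(dtree, data1[data1_x_bin], data1[Target], cv=cv_split, return_train_score=True)
dtree.fit(data1[data1_x_bin], data1[Target])

print('BEFORE tuning - training accuracy mean: {:.2f}'.format(base_results['train_score'].mean()*100))
print('BEFORE tuning - test accuracy mean: {:.2f}'.format(base_results['test_score'].mean()*100))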

2. Tune the Model with Hyperparameters

2.1 Define the hyperparameter search space

param_grid = {
    'criterion': ['gini', 'entropy'],  # two different split-quality criteria
    'max_depth': [2,4,6,8,10,None],  # limit tree depth to prevent overfitting
    'random_state': [0]  # fix the random seed for reproducibility
}

2.2 Run GridSearchCV for the hyperparameter search

tune_model = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='roc_auc',  # scoring metric: area under the ROC curve
    cv=cv_split  # cross-validation splitter defined above
)

tune_model.fit(data1[data1_x_bin], data1[Target])
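
After fitting, the tuned parameters and cross-validated score can be read off the fitted search object. A minimal sketch:

#best hyperparameters found by the grid search and the corresponding mean CV score (roc_auc here)
print('Best parameters:', tune_model.best_params_)
print('Best cross-validated AUC: {:.4f}'.format(tune_model.best_score_))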

3. Tune the Model with Feature Selection

3.1 Use RFECV (Recursive Feature Elimination with Cross-Validation)

dtree_rfe = feature_selection.RFECV(dtree, step=1, scoring='accuracy', cv=cv_split)
dtree_rfe.fit(data1[data1_x_bin], data1[Target])

step=1: remove one feature per iteration until the optimal feature subset is found

cv=cv_split: use cross-validation to keep the result stable

3.2 Keep the features selected by RFE and retrain the model

X_rfe = data1[data1_x_bin].columns.values[dtree_rfe.get_support()]
rfe_results = model_selection.cross_validate(dtree, data1[X_rfe], data1[Target], cv=cv_split)

dtree_rfe.get_support() returns a boolean mask indicating which features are kept (True = keep, False = drop)

data1[X_rfe] keeps only the optimal feature subset, which is used to train the new model

cross_validate() re-evaluates the model on the reduced feature set
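
The same grid search can then be repeated on the reduced feature set to check whether feature selection plus tuning beats either step alone. A minimal sketch reusing param_grid from section 2.1:

#which features survived recursive elimination
print('Selected features:', X_rfe.tolist())

#grid search on the RFE-selected feature subset
rfe_tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(),
                                              param_grid=param_grid,
                                              scoring='roc_auc',
                                              cv=cv_split)
rfe_tune_model.fit(data1[X_rfe], data1[Target])
print('Best parameters on RFE features:', rfe_tune_model.best_params_)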

Notes:

1. Try SelectFromModel as an alternative to RFE

from sklearn.feature_selection import SelectFromModel
#keep only the features whose importance in the fitted tree is above the median importance
sfm = SelectFromModel(dtree, threshold='median')
sfm.fit(data1[data1_x_bin], data1[Target])
selected_features = data1[data1_x_bin].columns[sfm.get_support()]
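
The selected subset can be evaluated the same way as the RFE subset. A minimal sketch:

#cross-validate the tree on the SelectFromModel-selected columns
sfm_results = model_selection.cross_validate(dtree, data1[selected_features], data1[Target], cv=cv_split)
print('SelectFromModel test accuracy mean: {:.2f}'.format(sfm_results['test_score'].mean()*100))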

2. Use RandomForestClassifier instead of a single decision tree

A single decision tree overfits easily; a random forest improves model stability, as in the sketch below.
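
A minimal sketch of swapping a random forest into the same feature-selection-plus-cross-validation pipeline (hypothetical variable names, reusing data1_x_bin, Target and cv_split from above):

#random forest as the base estimator for RFECV instead of a single decision tree
rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
rfc_rfe = feature_selection.RFECV(rfc, step=1, scoring='accuracy', cv=cv_split)
rfc_rfe.fit(data1[data1_x_bin], data1[Target])

X_rfc = data1[data1_x_bin].columns.values[rfc_rfe.get_support()]
rfc_results = model_selection.cross_validate(rfc, data1[X_rfc], data1[Target], cv=cv_split)
print('Random forest test accuracy mean: {:.2f}'.format(rfc_results['test_score'].mean()*100))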

4. Decision Tree Visualization (Graphviz)

import graphviz 
dot_data = tree.export_graphviz(
    dtree, 
    out_file=None, 
    feature_names=data1_x_bin,  # feature names
    class_names=True,  # show class names at the leaves
    filled=True,  # color-fill nodes for readability
    rounded=True  # rounded node corners
)
graph = graphviz.Source(dot_data) 
graph
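
In a script (rather than a notebook cell), the graph can be written to disk instead of displayed inline. A minimal usage sketch; the output filename is hypothetical and the Graphviz binaries must be installed:

#writes titanic_tree.pdf (the default format) and removes the intermediate DOT source
graph.render('titanic_tree', cleanup=True)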

Validate and Implement

1. Select Multiple Classification Models

vote_est = [
    ('ada', ensemble.AdaBoostClassifier()),  # AdaBoost
    ('bc', ensemble.BaggingClassifier()),  # Bagging
    ('etc', ensemble.ExtraTreesClassifier()),  # Extra Trees
    ('gbc', ensemble.GradientBoostingClassifier()),  # Gradient Boosting
    ('rfc', ensemble.RandomForestClassifier()),  # Random Forest

    ('gpc', gaussian_process.GaussianProcessClassifier()),  # Gaussian Process
    ('lr', linear_model.LogisticRegressionCV()),  # Logistic Regression

    ('bnb', naive_bayes.BernoulliNB()),  # Bernoulli Naive Bayes
    ('gnb', naive_bayes.GaussianNB()),  # Gaussian Naive Bayes

    ('knn', neighbors.KNeighborsClassifier()),  # k-Nearest Neighbors
    ('svc', svm.SVC(probability=True)),  # SVM (with probability output)

    ('xgb', XGBClassifier())  # XGBoost
]

2. Hard Voting

vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
vote_hard_cv = model_selection.cross_validate(vote_hard, data1[data1_x_bin], data1[Target], cv=cv_split)
vote_hard.fit(data1[data1_x_bin], data1[Target])

3. Soft Voting

vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, data1[data1_x_bin], data1[Target], cv=cv_split)
vote_soft.fit(data1[data1_x_bin], data1[Target])
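
The two voting strategies can be compared on the same cross-validation splits. A minimal sketch:

#mean cross-validated accuracy (and 3*std spread) of the two ensembles
print('Hard voting test accuracy mean: {:.2f} +/- {:.2f}'
      .format(vote_hard_cv['test_score'].mean()*100, vote_hard_cv['test_score'].std()*300))
print('Soft voting test accuracy mean: {:.2f} +/- {:.2f}'
      .format(vote_soft_cv['test_score'].mean()*100, vote_soft_cv['test_score'].std()*300))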

Next, hyperparameter-tune each classifier in the ensemble and use the optimized classifiers in the voting classifier.

1. Define the hyperparameter search ranges

grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

2. Define per-model hyperparameter search grids

grid_param = [
    #AdaBoostClassifier
    [{
        'n_estimators': grid_n_estimator,
        'learning_rate': grid_learn,
        'random_state': grid_seed
    }],
    #BaggingClassifier
    [{
        'n_estimators': grid_n_estimator,
        'max_samples': grid_ratio,
        'random_state': grid_seed
    }],
    #ExtraTreesClassifier
    [{
        'n_estimators': grid_n_estimator,
        'criterion': grid_criterion,
        'max_depth': grid_max_depth,
        'random_state': grid_seed
    }]
    #one grid is required for every estimator in vote_est, in the same order;
    #the remaining grids are omitted here for brevity, so the zip() loop below only tunes the first three models
]

3. Run GridSearchCV for hyperparameter tuning

import time

start_total = time.perf_counter()
for clf, param in zip(vote_est, grid_param):  # iterate over (name, estimator) pairs and their matching parameter grids
    start = time.perf_counter()        
    best_search = model_selection.GridSearchCV(estimator=clf[1], param_grid=param, cv=cv_split, scoring='roc_auc')
    best_search.fit(data1[data1_x_bin], data1[Target])
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))
    
    # update the classifier in vote_est with its best hyperparameters
    clf[1].set_params(**best_param)

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
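
The submission code below references grid_hard (and, commented out, grid_soft). These are the voting classifiers rebuilt from the now-tuned estimators in vote_est; a minimal sketch of how they can be constructed:

#hard and soft voting classifiers over the tuned estimators (set_params above modified vote_est in place)
grid_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
grid_hard_cv = model_selection.cross_validate(grid_hard, data1[data1_x_bin], data1[Target], cv=cv_split)
grid_hard.fit(data1[data1_x_bin], data1[Target])

grid_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
grid_soft_cv = model_selection.cross_validate(grid_soft, data1[data1_x_bin], data1[Target], cv=cv_split)
grid_soft.fit(data1[data1_x_bin], data1[Target])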

#handmade decision tree - submission score = 0.77990
data_val['Survived'] = mytree(data_val).astype(int)


#decision tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_dt = tree.DecisionTreeClassifier()
#submit_dt = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
#submit_dt.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_dt.best_params_) #Best Parameters:  {'criterion': 'gini', 'max_depth': 4, 'random_state': 0}
#data_val['Survived'] = submit_dt.predict(data_val[data1_x_bin])


#bagging w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77990
#submit_bc = ensemble.BaggingClassifier()
#submit_bc = model_selection.GridSearchCV(ensemble.BaggingClassifier(), param_grid= {'n_estimators':grid_n_estimator, 'max_samples': grid_ratio, 'oob_score': grid_bool, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_bc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_bc.best_params_) #Best Parameters:  {'max_samples': 0.25, 'n_estimators': 500, 'oob_score': True, 'random_state': 0}
#data_val['Survived'] = submit_bc.predict(data_val[data1_x_bin])


#extra tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_etc = ensemble.ExtraTreesClassifier()
#submit_etc = model_selection.GridSearchCV(ensemble.ExtraTreesClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_etc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_etc.best_params_) #Best Parameters:  {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_etc.predict(data_val[data1_x_bin])


#random forest w/full dataset modeling submission score: defaults= 0.71291, tuned= 0.73205
#submit_rfc = ensemble.RandomForestClassifier()
#submit_rfc = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_rfc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_rfc.best_params_) #Best Parameters:  {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_rfc.predict(data_val[data1_x_bin])



#ada boosting w/full dataset modeling submission score: defaults= 0.74162, tuned= 0.75119
#submit_abc = ensemble.AdaBoostClassifier()
#submit_abc = model_selection.GridSearchCV(ensemble.AdaBoostClassifier(), param_grid={'n_estimators': grid_n_estimator, 'learning_rate': grid_ratio, 'algorithm': ['SAMME', 'SAMME.R'], 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_abc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_abc.best_params_) #Best Parameters:  {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 300, 'random_state': 0}
#data_val['Survived'] = submit_abc.predict(data_val[data1_x_bin])


#gradient boosting w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77033
#submit_gbc = ensemble.GradientBoostingClassifier()
#submit_gbc = model_selection.GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid={'learning_rate': grid_ratio, 'n_estimators': grid_n_estimator, 'max_depth': grid_max_depth, 'random_state':grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_gbc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_gbc.best_params_) #Best Parameters:  {'learning_rate': 0.25, 'max_depth': 2, 'n_estimators': 50, 'random_state': 0}
#data_val['Survived'] = submit_gbc.predict(data_val[data1_x_bin])

#extreme boosting w/full dataset modeling submission score: defaults= 0.73684, tuned= 0.77990
#submit_xgb = XGBClassifier()
#submit_xgb = model_selection.GridSearchCV(XGBClassifier(), param_grid= {'learning_rate': grid_learn, 'max_depth': [0,2,4,6,8,10], 'n_estimators': grid_n_estimator, 'seed': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_xgb.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_xgb.best_params_) #Best Parameters:  {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'seed': 0}
#data_val['Survived'] = submit_xgb.predict(data_val[data1_x_bin])


#hard voting classifier w/full dataset modeling submission score: defaults= 0.75598, tuned = 0.77990
#data_val['Survived'] = vote_hard.predict(data_val[data1_x_bin])
data_val['Survived'] = grid_hard.predict(data_val[data1_x_bin])


#soft voting classifier w/full dataset modeling submission score: defaults= 0.73684, tuned = 0.74162
#data_val['Survived'] = vote_soft.predict(data_val[data1_x_bin])
#data_val['Survived'] = grid_soft.predict(data_val[data1_x_bin])
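
Whichever predictor is used, the final step is to write the PassengerId/Survived pairs to a CSV for submission. A minimal sketch, assuming data_val still carries the PassengerId column from the raw test file:

#submission file in the format Kaggle expects: PassengerId, Survived
submit = data_val[['PassengerId', 'Survived']]
submit.to_csv('submit.csv', index=False)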
