模型数据 -- Model Data
# Machine Learning Algorithm (MLA) selection and initialization.
# One default-configured classifier per family; each is cross-validated below.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),               # adaptive boosting
    ensemble.BaggingClassifier(),                # bagging (bootstrap aggregating)
    ensemble.ExtraTreesClassifier(),             # extremely randomized trees
    ensemble.GradientBoostingClassifier(),       # gradient-boosted decision trees (GBDT)
    ensemble.RandomForestClassifier(),           # random forest

    # Gaussian process
    gaussian_process.GaussianProcessClassifier(),  # Bayesian-inference classifier

    # Generalized linear models
    linear_model.LogisticRegressionCV(),         # logistic regression with built-in CV
    linear_model.PassiveAggressiveClassifier(),  # passive-aggressive (online learning)
    linear_model.RidgeClassifierCV(),            # ridge classifier with built-in CV
    linear_model.SGDClassifier(),                # stochastic gradient descent
    linear_model.Perceptron(),                   # perceptron

    # Naive Bayes
    naive_bayes.BernoulliNB(),                   # for binary features
    naive_bayes.GaussianNB(),                    # for continuous features

    # Nearest neighbours
    neighbors.KNeighborsClassifier(),            # KNN

    # Support vector machines
    svm.SVC(probability=True),                   # SVC with probability estimates
    svm.NuSVC(probability=True),                 # SVC variant regularized via nu
    svm.LinearSVC(),                             # linear SVM, scales to large data

    # Decision trees
    tree.DecisionTreeClassifier(),               # classic decision tree
    tree.ExtraTreeClassifier(),                  # extremely randomized tree

    # Discriminant analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),     # LDA
    discriminant_analysis.QuadraticDiscriminantAnalysis(),  # QDA

    # XGBoost
    XGBClassifier(),                             # gradient-boosted trees (xgboost)
]
# Cross-validation splitter (an alternative to a single train_test_split):
# run each model 10x on resampled 60/30 train/test splits, intentionally
# leaving 10% of the data unused on every run.
cv_split = model_selection.ShuffleSplit(
    n_splits=10,
    test_size=.3,
    train_size=.6,
    random_state=0,
)

# Comparison table: one row per algorithm with its parameters, scores, timing.
MLA_columns = [
    'MLA Name',
    'MLA Parameters',
    'MLA Train Accuracy Mean',
    'MLA Test Accuracy Mean',
    'MLA Test Accuracy 3*STD',
    'MLA Time',
]
MLA_compare = pd.DataFrame(columns=MLA_columns)

# Table of per-algorithm predictions, seeded with the target column.
MLA_predict = data1[Target]
# Index through MLA, cross-validate each algorithm, and record its metrics.
row_index = 0
for alg in MLA:
    # Record the algorithm's name and its (default) hyperparameters.
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

    # Score the model with cross-validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
    # return_train_score=True is required: scikit-learn >= 0.21 defaults it to
    # False, which would make cv_results['train_score'] below raise a KeyError.
    cv_results = model_selection.cross_validate(
        alg, data1[data1_x_bin], data1[Target], cv=cv_split,
        return_train_score=True,
    )
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    # For an unbiased random sample, +/-3 standard deviations around the mean
    # statistically captures ~99.7% of subsets -- i.e. the worst to expect.
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std() * 3

    # Save in-sample MLA predictions - see section 6 for usage.
    alg.fit(data1[data1_x_bin], data1[Target])
    MLA_predict[MLA_name] = alg.predict(data1[data1_x_bin])
    row_index += 1

# Sort best-to-worst by mean test accuracy and display the table.
MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False, inplace=True)
MLA_compare
#MLA_predict
通过交叉验证来比较多种机器学习算法(MLA)在给定数据集上的性能,并将结果存储在一个表格中,以便后续分析和选择最佳模型
补充:
AdaBoostClassifier():使用多个弱分类器(如决策树),通过调整权重来提升准确率。
BaggingClassifier():通过自助采样(bootstrap)创建多个训练集并进行集成。
ExtraTreesClassifier():类似随机森林,但增加了更多随机性。
GradientBoostingClassifier():通过逐步修正前一次模型的误差来提升性能,适用于小数据集。
RandomForestClassifier():是多个决策树的集合,防止过拟合。
GaussianProcessClassifier():使用贝叶斯推理进行分类,适合小数据集,但计算量大。
LogisticRegressionCV():是逻辑回归,自动选择最佳正则化参数。
PassiveAggressiveClassifier():适用于在线学习,适合大规模数据。
RidgeClassifierCV():使用岭回归,可以减少多重共线性问题。
SGDClassifier():适用于大规模数据,使用随机梯度下降优化。
Perceptron():是单层神经网络(感知机),只能学习线性可分数据。
BernoulliNB():适用于二元特征(如文本分类中的词袋模型)。
GaussianNB():假设特征服从高斯分布,适用于连续变量的分类任务。
KNeighborsClassifier():通过计算与训练样本的距离来分类,适合小数据集。
SVC():适用于非线性分类,支持kernel='rbf'、'poly'等不同核函数。
NuSVC():类似SVC(),但超参数Nu取代C进行正则化控制。
LinearSVC():适用于大规模数据,但仅支持线性分类。
DecisionTreeClassifier():是基本的决策树,容易过拟合。
ExtraTreeClassifier():通过增加随机性来减少过拟合。
LinearDiscriminantAnalysis():适用于数据类别分布符合正态分布的情况。
QuadraticDiscriminantAnalysis():允许不同类别具有不同协方差矩阵,更灵活。
XGBClassifier():是梯度提升树(GBDT),比GradientBoostingClassifier()训练速度更快,适合大规模数据。
评估模型性能
以下是利用手工决策树做 Titanic 生存预测,不依赖机器学习算法
#handmade data model using brain power (and Microsoft Excel Pivot Tables for quick calculations)
def mytree(df):
    """Hand-built decision tree for Titanic survival prediction.

    Expects columns 'Sex', 'Pclass', 'Embarked', 'Fare', 'Title' in *df*.
    Returns a DataFrame (same index as *df*) with a single column 'Predict'
    holding 0 (died) or 1 (survived).
    """
    # Table to store the predictions.
    Model = pd.DataFrame(data={'Predict': []})
    male_title = ['Master']  # male titles whose majority survived

    for index, row in df.iterrows():
        # Question 1: were you on the Titanic? Majority died -> default 0.
        predict = 0

        if row['Sex'] == 'female':
            # Question 2: females mostly survived ...
            predict = 1
            # Question 5B: ... except 3rd-class Southampton embarkees with
            # fare > 8, whose decision-tree node drops below a 0.5 survival
            # rate. (Question 3A class and Question 4 embarked alone gain
            # minimal information.)
            if row['Pclass'] == 3 and row['Embarked'] == 'S' and row['Fare'] > 8:
                predict = 0
        elif row['Sex'] == 'male' and row['Title'] in male_title:
            # Question 3B: males with a majority-survived title.
            predict = 1

        Model.loc[index, 'Predict'] = predict

    return Model
#model data
Tree_Predict = mytree(data1)
# Accuracy of the handmade tree against the known labels, as a percentage.
tree_accuracy_pct = metrics.accuracy_score(data1['Survived'], Tree_Predict) * 100
print('Decision Tree Model Accuracy/Precision Score: {:.2f}%\n'.format(tree_accuracy_pct))
print(metrics.classification_report(data1['Survived'], Tree_Predict))
通过以上自定义决策树的预测,准确率能达到82.04%;若某机器学习算法的准确率低于该数值,则说明该算法的效果较差。
1.交叉验证(Cross-Validation)的模型性能
注意:训练数据使用不同的子集来构建模型,并使用测试数据来评估我们的模型,否则,模型会过拟合
2.使用超参数调整模型
2.1设置超参数搜索范围
# Hyper-parameter search space for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],     # impurity measures to compare
    'max_depth': [2, 4, 6, 8, 10, None],  # cap tree depth to curb overfitting
    'random_state': [0],                  # fixed seed for reproducibility
}
2.2运行GridSearchCV进行超参数搜索
# Exhaustive hyper-parameter search, scored by ROC AUC on the shared CV splits.
tune_model = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=cv_split,          # cross-validation splitter defined above
    scoring='roc_auc',    # evaluation metric: area under the ROC curve
)
tune_model.fit(data1[data1_x_bin], data1[Target])
3.使用特征选择来调整模型
3.1使用RFECV(递归特征消除+交叉验证 -- Recursive Feature Elimination with Cross-Validation)
# Recursive feature elimination with cross-validation (RFECV);
# dtree is the base estimator defined earlier in the notebook.
dtree_rfe = feature_selection.RFECV(
    dtree,
    step=1,              # eliminate one feature per iteration
    cv=cv_split,         # CV keeps the selection stable
    scoring='accuracy',
)
dtree_rfe.fit(data1[data1_x_bin], data1[Target])
step=1:每次消除一个特征,直到找到最优特征子集;
cv=cv_split:使用交叉验证保证结果稳定性。
3.2选择被 RFE 选出的特征,并重新训练模型
# Keep only the features RFECV retained, then re-score the tree on that subset.
rfe_mask = dtree_rfe.get_support()  # boolean mask: True = feature kept
X_rfe = data1[data1_x_bin].columns.values[rfe_mask]
rfe_results = model_selection.cross_validate(dtree, data1[X_rfe], data1[Target], cv=cv_split)
dtree_rfe.get_support():返回布尔数组,表示哪些特征被保留(True=保留,False=删除);
data1[X_rfe]:只保留最优特征子集,用于训练新模型;
cross_validate():重新评估使用 RFE 之后的新数据集。
补充:
1.尝试 SelectFromModel 替代 RFE
from sklearn.feature_selection import SelectFromModel

# Importance-threshold selection: keep features whose importance is at or
# above the median importance reported by dtree (fit returns the selector).
sfm = SelectFromModel(dtree, threshold='median').fit(data1[data1_x_bin], data1[Target])
selected_features = data1[data1_x_bin].columns[sfm.get_support()]
2.使用 RandomForestClassifier 替代单一决策树
决策树容易过拟合,随机森林可以提升模型稳定性
4.决策树可视化(Graphviz)
import graphviz

# Render the fitted decision tree as a Graphviz graph for inspection.
dot_data = tree.export_graphviz(
    dtree,
    out_file=None,           # return the DOT source instead of writing a file
    feature_names=data1_x_bin,  # feature labels for the split nodes
    class_names=True,        # show class labels on the leaves
    filled=True,             # colour nodes for readability
    rounded=True,            # rounded node corners
)
graph = graphviz.Source(dot_data)
graph
验证和实施
1.选择多个分类模型
# Estimators for the voting ensemble: (label, default-configured model) pairs.
vote_est = [
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gpc', gaussian_process.GaussianProcessClassifier()),
    ('lr', linear_model.LogisticRegressionCV()),
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),
    ('knn', neighbors.KNeighborsClassifier()),
    ('svc', svm.SVC(probability=True)),  # probability=True enables soft voting
    ('xgb', XGBClassifier()),
]
2.硬投票(Hard Voting)
# Hard voting: each estimator casts one vote for a class; the majority wins.
vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
vote_hard_cv = model_selection.cross_validate(
    vote_hard, data1[data1_x_bin], data1[Target], cv=cv_split
)
vote_hard.fit(data1[data1_x_bin], data1[Target])
3.软投票(Soft Voting)
# Soft voting: class probabilities are averaged across estimators.
vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
vote_soft_cv = model_selection.cross_validate(
    vote_soft, data1[data1_x_bin], data1[Target], cv=cv_split
)
vote_soft.fit(data1[data1_x_bin], data1[Target])
多机器学习模型分类器进行超参数调优,并将优化后的分类器用于投票分类器
1.定义超参数搜索范围
# Shared hyper-parameter value grids, reused by the per-model searches below.
grid_n_estimator = [10, 50, 100, 300]      # ensemble sizes
grid_ratio = [.1, .25, .5, .75, 1.0]       # sample/feature fractions
grid_learn = [.01, .03, .05, .1, .25]      # learning rates
grid_max_depth = [2, 4, 6, 8, 10, None]    # tree depth caps (None = unlimited)
grid_min_samples = [5, 10, .03, .05, .10]  # counts (int) or fractions (float)
grid_criterion = ['gini', 'entropy']       # split-quality measures
grid_bool = [True, False]
grid_seed = [0]                            # fixed seed for reproducibility
2.定义不同模型的超参数搜索网格
# Per-model search grids, positionally aligned with the entries of vote_est.
# NOTE(review): only three grids are visible in this chunk for twelve
# estimators; zip() in the tuning loop stops at the shortest sequence --
# confirm the remaining grids exist elsewhere in the file.
grid_param = [
    # 'ada': learning_rate is AdaBoost's shrinkage parameter
    [{
        'n_estimators': grid_n_estimator,
        'learning_rate': grid_learn,
        'random_state': grid_seed,
    }],
    # 'bc': max_samples is the bootstrap-sample fraction
    [{
        'n_estimators': grid_n_estimator,
        'max_samples': grid_ratio,
        'random_state': grid_seed,
    }],
    # 'etc': tree-shape parameters for the extra-trees ensemble
    [{
        'n_estimators': grid_n_estimator,
        'criterion': grid_criterion,
        'max_depth': grid_max_depth,
        'random_state': grid_seed,
    }],
]
3.使用 GridSearchCV 进行超参数调优
# Tune each estimator in place: grid-search its parameters, report the result,
# then overwrite the estimator's settings with the best combination found.
start_total = time.perf_counter()
for (_, est), params in zip(vote_est, grid_param):
    start = time.perf_counter()
    best_search = model_selection.GridSearchCV(
        estimator=est, param_grid=params, cv=cv_split, scoring='roc_auc'
    )
    best_search.fit(data1[data1_x_bin], data1[Target])
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(est.__class__.__name__, best_param, run))
    # Mutating the estimator object also updates it inside vote_est.
    est.set_params(**best_param)

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
# Candidate submission models with their observed Kaggle scores. Only one
# prediction line is active at a time; the alternatives are deliberately kept
# commented out, with their scores, for reference.
#handmade decision tree - submission score = 0.77990
data_val['Survived'] = mytree(data_val).astype(int)
#decision tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_dt = tree.DecisionTreeClassifier()
#submit_dt = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
#submit_dt.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_dt.best_params_) #Best Parameters: {'criterion': 'gini', 'max_depth': 4, 'random_state': 0}
#data_val['Survived'] = submit_dt.predict(data_val[data1_x_bin])
#bagging w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77990
#submit_bc = ensemble.BaggingClassifier()
#submit_bc = model_selection.GridSearchCV(ensemble.BaggingClassifier(), param_grid= {'n_estimators':grid_n_estimator, 'max_samples': grid_ratio, 'oob_score': grid_bool, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_bc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_bc.best_params_) #Best Parameters: {'max_samples': 0.25, 'n_estimators': 500, 'oob_score': True, 'random_state': 0}
#data_val['Survived'] = submit_bc.predict(data_val[data1_x_bin])
#extra tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_etc = ensemble.ExtraTreesClassifier()
#submit_etc = model_selection.GridSearchCV(ensemble.ExtraTreesClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_etc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_etc.best_params_) #Best Parameters: {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_etc.predict(data_val[data1_x_bin])
#random foreset w/full dataset modeling submission score: defaults= 0.71291, tuned= 0.73205
#submit_rfc = ensemble.RandomForestClassifier()
#submit_rfc = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_rfc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_rfc.best_params_) #Best Parameters: {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_rfc.predict(data_val[data1_x_bin])
#ada boosting w/full dataset modeling submission score: defaults= 0.74162, tuned= 0.75119
#submit_abc = ensemble.AdaBoostClassifier()
#submit_abc = model_selection.GridSearchCV(ensemble.AdaBoostClassifier(), param_grid={'n_estimators': grid_n_estimator, 'learning_rate': grid_ratio, 'algorithm': ['SAMME', 'SAMME.R'], 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_abc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_abc.best_params_) #Best Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 300, 'random_state': 0}
#data_val['Survived'] = submit_abc.predict(data_val[data1_x_bin])
#gradient boosting w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77033
#submit_gbc = ensemble.GradientBoostingClassifier()
#submit_gbc = model_selection.GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid={'learning_rate': grid_ratio, 'n_estimators': grid_n_estimator, 'max_depth': grid_max_depth, 'random_state':grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_gbc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_gbc.best_params_) #Best Parameters: {'learning_rate': 0.25, 'max_depth': 2, 'n_estimators': 50, 'random_state': 0}
#data_val['Survived'] = submit_gbc.predict(data_val[data1_x_bin])
#extreme boosting w/full dataset modeling submission score: defaults= 0.73684, tuned= 0.77990
#submit_xgb = XGBClassifier()
#submit_xgb = model_selection.GridSearchCV(XGBClassifier(), param_grid= {'learning_rate': grid_learn, 'max_depth': [0,2,4,6,8,10], 'n_estimators': grid_n_estimator, 'seed': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_xgb.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_xgb.best_params_) #Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'seed': 0}
#data_val['Survived'] = submit_xgb.predict(data_val[data1_x_bin])
#hard voting classifier w/full dataset modeling submission score: defaults= 0.75598, tuned = 0.77990
#data_val['Survived'] = vote_hard.predict(data_val[data1_x_bin])
# NOTE(review): grid_hard is not defined in this chunk (only vote_hard is);
# presumably it is the grid-search-tuned hard-voting classifier built
# elsewhere in the notebook -- confirm it exists before running this line.
data_val['Survived'] = grid_hard.predict(data_val[data1_x_bin])
#soft voting classifier w/full dataset modeling submission score: defaults= 0.73684, tuned = 0.74162
#data_val['Survived'] = vote_soft.predict(data_val[data1_x_bin])
#data_val['Survived'] = grid_soft.predict(data_val[data1_x_bin])