from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, ShuffleSplit,GridSearchCV
from sklearn import model_selection
# GridSearchCV accepts n_jobs=<k> to run the search on k CPU cores (default 1;
# -1 uses every core). The tree-model parameter grids below can be customised
# freely, but each extra candidate value multiplies the running cost.
def _grid_search_feature_ranking(tag, estimator, param_grid, train_X, train_Y,
                                 top_n, sample_header):
    """Grid-search one estimator and rank the features by its importances.

    Args:
        tag: Short model label inserted into the progress messages
            (e.g. ``'RF'``).
        estimator: Unfitted estimator whose fitted form exposes
            ``feature_importances_``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        train_X: Training features. ``list(train_X)`` supplies the feature
            names (assumes a pandas DataFrame, so these are the column
            labels -- confirm against the caller).
        train_Y: Training target.
        top_n: Number of highest-importance features to keep.
        sample_header: Line printed right before the first ten kept features.

    Returns:
        A ``(top_features, importance_sorted)`` tuple: the ``'feature'``
        column of the ``top_n`` most important rows, and the full
        feature/importance DataFrame sorted by descending importance.
    """
    grid = model_selection.GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5, verbose=1)
    grid.fit(train_X, train_Y)
    print('Top N Features Best ' + tag + ' Params:' + str(grid.best_params_))
    print('Top N Features Best ' + tag + ' Score:' + str(grid.best_score_))
    print('Top N Features ' + tag + ' Train Score:' + str(grid.score(train_X, train_Y)))

    importance_sorted = pd.DataFrame({
        'feature': list(train_X),
        'importance': grid.best_estimator_.feature_importances_,
    }).sort_values('importance', ascending=False)
    top_features = importance_sorted.head(top_n)['feature']
    print(sample_header)
    print(str(top_features[:10]))
    return top_features, importance_sorted


def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    """Select important features by combining five tree-based regressors.

    A random forest, AdaBoost, extra-trees, gradient-boosting and single
    decision-tree regressor are each tuned with a 5-fold ``GridSearchCV``.
    Every model contributes its ``top_n_features`` most important features,
    and the union of those names is returned.

    Args:
        titanic_train_data_X: Training features; see
            ``_grid_search_feature_ranking`` for the assumed type.
        titanic_train_data_Y: Training target.
        top_n_features: How many top features to take from each model.

    Returns:
        A ``(features_top_n, features_importance)`` tuple.
        ``features_top_n`` is a Series of the de-duplicated feature names
        picked by any model. ``features_importance`` is a DataFrame with
        ``'feature'`` and ``'importance'`` columns holding the sorted
        importances of all five models stacked together.
    """
    # (label, estimator, parameter grid, header for the sample print-out).
    # The headers differ slightly between models; they are kept verbatim so
    # the console output does not change.
    searches = [
        ('RF', RandomForestRegressor(random_state=0),
         {'n_estimators': [400], 'min_samples_split': [2, 3], 'max_depth': [10]},
         'Sample 10 Features from RF Classifier'),
        ('Ada', AdaBoostRegressor(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1]},
         'Sample 10 Feature from Ada Classifier:'),
        ('ET', ExtraTreesRegressor(random_state=0),
         {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [10]},
         'Sample 10 Features from ET Classifier:'),
        ('GB', GradientBoostingRegressor(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [10]},
         'Sample 10 Feature from GB Classifier:'),
        ('DT', DecisionTreeRegressor(random_state=0),
         {'min_samples_split': [2, 4], 'max_depth': [10]},
         'Sample 10 Features from DT Classifier:'),
    ]

    top_per_model = []
    importance_per_model = []
    for tag, estimator, param_grid, sample_header in searches:
        # Every model, the random forest included, honours top_n_features;
        # the random forest previously used a hard-coded 100 instead.
        top_features, importance_sorted = _grid_search_feature_ranking(
            tag, estimator, param_grid,
            titanic_train_data_X, titanic_train_data_Y,
            top_n_features, sample_header,
        )
        top_per_model.append(top_features)
        importance_per_model.append(importance_sorted)

    # Merge the five models: union of selected names, and all importances.
    features_top_n = pd.concat(top_per_model, ignore_index=True).drop_duplicates()
    features_importance = pd.concat(importance_per_model, ignore_index=True)
    return features_top_n, features_importance
# Number of top-ranked features requested from each model.
feature_to_pick = 30
feature_top_n, feature_importance = get_top_n_features(train_input, train_output, feature_to_pick)
# Restrict both data sets to the selected feature columns. The selection is
# read from `feature_top_n`, the name bound by the call above.
train_input = pd.DataFrame(train_input[feature_top_n])
test_data = pd.DataFrame(test_data[feature_top_n])