import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse


class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a one-hot or ordinal integer array.

    Back-port of the scikit-learn development-branch CategoricalEncoder,
    kept here so the script has no dependency on an unreleased version."""

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.encoding)
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per column, fitted on the observed (or supplied) categories.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.isin(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))
        self.categories_ = [le.classes_ for le in self._label_encoders_]
        return self

    def transform(self, X):
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.isin(X[:, i], self.categories_[i])
            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed from the one-hot output later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        # Build the sparse one-hot matrix: each feature occupies a contiguous
        # block of columns, one column per category.
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]
        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out
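# A minimal usage sketch for the CategoricalEncoder above (illustration only:
# the pipeline below encodes categoricals with pd.get_dummies instead, and the
# class is kept for reference). The toy array is hypothetical:
# enc = CategoricalEncoder(encoding='onehot-dense')
# demo = np.array([['a', 'x'], ['b', 'y'], ['a', 'y']], dtype=object)
# print(enc.fit_transform(demo))
# # -> 3 x 4 dense array: columns ['a', 'b'] for feature 0, ['x', 'y'] for feature 1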
################### loading data ##################
train = pd.read_csv('train.csv', dtype={"Age": np.float64})
test = pd.read_csv("test.csv", dtype={"Age": np.float64})
PassengerId = test['PassengerId']
all_data = pd.concat([train, test], ignore_index=True)
all_data.info()
# print(all_data.head())
# corr_train = all_data.corr()
# a = corr_train["Survived"].sort_values(ascending=False)
# print(a)

################## Data munging #######################
## numerical variables: PassengerId, Age, SibSp, Parch, Fare
## categorical variables: Pclass, Sex, Embarked
## variables with text: Name, Cabin, Ticket

###### 1. numerical variables: Age, Fare
all_data["Fare"] = all_data["Fare"].fillna(all_data["Fare"].mean())

def fit_missing_Age(data):
    # Impute missing ages with a random forest fitted on the other numeric columns.
    age_data = data[["Age", "SibSp", "Parch", "Fare", "Pclass"]]
    known_age = age_data[age_data.Age.notnull()].values
    unknown_age = age_data[age_data.Age.isnull()].values
    y = known_age[:, 0]
    x = known_age[:, 1:]
    rfreg = RandomForestRegressor(random_state=0, n_estimators=100)
    rfreg.fit(x, y)
    predict_age = rfreg.predict(unknown_age[:, 1:])
    data.loc[(data.Age.isnull()), "Age"] = predict_age
    return data

all_data = fit_missing_Age(all_data)
# all_data["Age"].fillna(all_data["Age"].median(), inplace=True)

age_fare = pd.DataFrame({"Age": all_data.Age, "Fare": all_data.Fare})
# from sklearn.preprocessing import StandardScaler
# std_scaler = StandardScaler()
# data_num = std_scaler.fit_transform(age_fare)
# print(data_num.max(), data_num.min(), data_num.mean())
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
data1 = minmax_scaler.fit_transform(age_fare)
data_num = pd.DataFrame({"Agenum": data1[:, 0], "Farenum": data1[:, 1]})
print(data_num.describe())

###### 2. categorical variables
def Familylabel(s):
    # Map family size to a coarse label: mid-sized families (2-4) survived best.
    if 2 <= s <= 4:
        return 2
    elif (5 <= s <= 7) or s == 1:
        return 1
    elif s > 7:
        return 0

all_data["FamilySize"] = all_data["SibSp"] + all_data["Parch"] + 1
all_data["FamilyLabel"] = all_data["FamilySize"].apply(Familylabel)
# sns.barplot(x="FamilyLabel", y="Survived", data=all_data, palette='Set3')
# plt.show()

# Extract the title ("Mr", "Miss", ...) from the Name column and group rare
# titles into broader classes.
all_data["Title"] = all_data["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
Title_Dict = {}
Title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
Title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
Title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
Title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
Title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
Title_Dict.update(dict.fromkeys(['Master', 'Jonkheer'], 'Master'))
all_data['Title'] = all_data['Title'].map(Title_Dict)
# print(all_data["Title"])
plt.subplot(333)
sns.barplot(x="Title", y="Survived", data=all_data, palette='Set3')
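# Sanity-check sketch for the Title feature above. "Braund, Mr. Owen Harris"
# is a name from the Titanic training set; titles absent from Title_Dict map
# to NaN, which the isnull() count below would reveal:
# assert "Braund, Mr. Owen Harris".split(",")[1].split(".")[0].strip() == "Mr"
# print(all_data["Title"].isnull().sum())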
def set_Cabin_type(df):
    # Reduce Cabin to a binary known/unknown indicator.
    df.loc[(df.Cabin.notnull()), "Cabin"] = "Yes"
    df.loc[(df.Cabin.isnull()), "Cabin"] = "No"
    return df

all_data = set_Cabin_type(all_data)

###### 3. categorical variables: Pclass, Sex, Embarked
all_data.loc[(all_data.Embarked.isnull()), "Embarked"] = "S"
c = all_data.isnull().sum()
print(c)

data = all_data[["Survived", "Embarked", "FamilyLabel", "Title", "Cabin", "Pclass", "Sex"]]
# data = pd.get_dummies(data)
dummies_Embarked = pd.get_dummies(all_data["Embarked"], prefix="Embarked")
dummies_FamilyLabel = pd.get_dummies(all_data["FamilyLabel"], prefix="FamilyLabel")
dummies_Title = pd.get_dummies(all_data["Title"], prefix="Title")
dummies_Cabin = pd.get_dummies(all_data["Cabin"], prefix="Cabin")
dummies_Pclass = pd.get_dummies(all_data["Pclass"], prefix="Pclass")
dummies_Sex = pd.get_dummies(all_data["Sex"], prefix="Sex")
data = pd.concat([all_data["Survived"], dummies_FamilyLabel, dummies_Title, dummies_Cabin,
                  dummies_Pclass, dummies_Sex, dummies_Embarked,
                  data_num.Agenum, data_num.Farenum], axis=1)

train = data[data.Survived.notnull()]
test = data[data.Survived.isnull()].drop("Survived", axis=1)
x = train.drop("Survived", axis=1)
y = train["Survived"].astype(int)

# from sklearn.model_selection import learning_curve
#
# # Use sklearn's learning_curve to compute the training and CV scores, then
# # draw the learning curve with matplotlib.
# def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
#                         train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
#     """
#     Plot the learning curve of an estimator on the data.
#     Parameters
#     ----------
#     estimator : the classifier to evaluate.
#     title : title of the plot.
#     X : input features, as a numpy array.
#     y : target vector.
#     ylim : (ymin, ymax) tuple fixing the y-axis limits of the plot.
#     cv : number of cross-validation folds; one fold is used for validation
#          and the remaining n-1 for training (default 3).
#     n_jobs : number of parallel jobs (default 1).
#     """
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
#
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#
#     if plot:
#         plt.figure()
#         plt.title(title)
#         if ylim is not None:
#             plt.ylim(*ylim)
#         plt.xlabel("Number of training samples")
#         plt.ylabel("Score")
#         plt.gca().invert_yaxis()
#         plt.grid()
#
#         plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                          train_scores_mean + train_scores_std, alpha=0.1, color="b")
#         plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                          test_scores_mean + test_scores_std, alpha=0.1, color="r")
#         plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
#         plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Cross-validation score")
#
#         plt.legend(loc="best")
#
#         plt.draw()
#         plt.show()
#         plt.gca().invert_yaxis()
#
#     midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
#                 (test_scores_mean[-1] - test_scores_std[-1])) / 2
#     diff = (train_scores_mean[-1] + train_scores_std[-1]) - \
#            (test_scores_mean[-1] - test_scores_std[-1])
#     return midpoint, diff
#
# plot_learning_curve(clf_svm, "Learning curve", x, y)

clf_gradient = GradientBoostingClassifier()
clf_gradient.fit(x, y)
# predict_gradient = clf_gradient.predict(test)
gradient_score = cross_val_score(clf_gradient, x, y, cv=10)
print("GBDT:", gradient_score)
print(gradient_score.mean())
predictions_gradient = clf_gradient.predict(test)
submission_gradient = pd.DataFrame({"PassengerId": PassengerId,
                                    "Survived": predictions_gradient.astype(int)})
submission_gradient.to_csv("submission_gradient.csv", index=False)
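# Optional sketch: inspect which engineered features the fitted GBDT relies on.
# feature_importances_ is a standard attribute of fitted sklearn tree ensembles:
# importances = pd.Series(clf_gradient.feature_importances_, index=x.columns)
# print(importances.sort_values(ascending=False).head(10))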
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier()
clf_knn.fit(x, y)
knn_score = cross_val_score(clf_knn, x, y, cv=10)
print("KNN:", knn_score)
print(knn_score.mean())
predictions_knn = clf_knn.predict(test)
submission_knn = pd.DataFrame({"PassengerId": PassengerId,
                               "Survived": predictions_knn.astype(int)})
submission_knn.to_csv("submission_knn.csv", index=False)

# param_grid = [
#     {"kernel": ["linear"], "C": [12, 20, 30, 40, 50, 60], "gamma": [0.01, 0.03, 0.1, 0.4, 0.7, 0.9]},
#     {"kernel": ["rbf"], "C": [1, 3, 5, 7, 9, 11], "gamma": [0.01, 0.03, 0.1, 0.4, 0.7, 0.9]},
# ]
# svm_reg = svm.SVC()
# grid_research = GridSearchCV(svm_reg, param_grid, cv=10)
# grid_research.fit(x, y)
# print(grid_research.best_estimator_)
# print(grid_research.best_score_)
# predict_svmgrid = grid_research.predict(test)
# submission_svmgrid = pd.DataFrame({"PassengerId": PassengerId, "Survived": predict_svmgrid.astype(int)})
# submission_svmgrid.to_csv("submission_svmgrid.csv", index=False)

clf_svm = svm.SVC(kernel="rbf", C=1, gamma=0.1)
clf_svm.fit(x, y)
svm_score = cross_val_score(clf_svm, x, y, cv=10)
predict_svm = clf_svm.predict(test)
submission_svm = pd.DataFrame({"PassengerId": PassengerId, "Survived": predict_svm.astype(int)})
submission_svm.to_csv("submission_svm.csv", index=False)
print("SVM:", svm_score)
print(svm_score.mean())

# param_rf = {
#     "max_features": ["auto", "sqrt", 0.2, 0.6],
#     "n_estimators": [50, 100, 150, 200],
#     "criterion": ["gini", "entropy"],
#     "min_samples_leaf": [4, 8, 12, 20],
# }
# clf_rf = RandomForestClassifier(random_state=16)
# grid = GridSearchCV(clf_rf, param_rf)
# grid.fit(x, y)
# rf_score = cross_val_score(grid, x, y, cv=10)
# print("RF_Grid:", rf_score)
# print(rf_score.mean())
# print(grid.best_params_)
# print(grid.best_estimator_)
# print(grid.best_score_)

clf_rf = RandomForestClassifier(criterion="entropy", max_features=0.6,
                                min_samples_leaf=8, n_estimators=200, random_state=16)
clf_rf.fit(x, y)
rf_score = cross_val_score(clf_rf, x, y, cv=10)
predict_rf = clf_rf.predict(test)
submission_rf = pd.DataFrame({"PassengerId": PassengerId, "Survived": predict_rf.astype(int)})
submission_rf.to_csv("submission_rf.csv", index=False)
print("RF:", rf_score)
print(rf_score.mean())

clf_lr = LogisticRegression()
clf_lr.fit(x, y)
lr_score = cross_val_score(clf_lr, x, y, cv=10)
# a = pd.DataFrame({"columns": list(x.columns), "coef": list(clf_lr.coef_.T)})
# print(a)
print("LR:", lr_score)
print(lr_score.mean())

from xgboost import XGBClassifier
# param_test = {
#     "n_estimators": [25, 30, 34, 38, 40],
#     "max_depth": [4, 5, 7, 10, 12],
#     "learning_rate": [0.1, 0.2, 0.3],
# }
# clf_xgboost = XGBClassifier()
# xgboost_grid = GridSearchCV(estimator=clf_xgboost, param_grid=param_test, cv=5)
# xgboost_grid.fit(x, y)
# xgboost_score = cross_val_score(xgboost_grid, x, y, cv=10)
# print(xgboost_grid.best_params_)
# print(xgboost_grid.best_estimator_)
# print(xgboost_score)
# print("Xgboost:", xgboost_score)
# print(xgboost_score.mean())

clf_xgboost = XGBClassifier(learning_rate=0.2, max_depth=5, objective="binary:logistic")
# clf_xgboost = XGBClassifier(learning_rate=0.1, max_depth=2, objective="binary:logistic")
clf_xgboost.fit(x, y)
xgboost_score = cross_val_score(clf_xgboost, x, y, cv=10)
predict_xgboost = clf_xgboost.predict(test)
submission_xgboost = pd.DataFrame({"PassengerId": PassengerId,
                                   "Survived": predict_xgboost.astype(int)})
submission_xgboost.to_csv("submission_xgboost.csv", index=False)
print("Xgboost:", xgboost_score)
print(xgboost_score.mean())
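# Comparison sketch: gather the 10-fold CV means computed above into one table
# (uses only variables already defined at this point):
# scores = pd.Series({"GBDT": gradient_score.mean(), "KNN": knn_score.mean(),
#                     "SVM": svm_score.mean(), "RF": rf_score.mean(),
#                     "LR": lr_score.mean(), "XGBoost": xgboost_score.mean()})
# print(scores.sort_values(ascending=False))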
print("Xgboost:",xgboost_score) print(xgboost_score.mean()) ######stacking####################3 from sklearn.model_selection import KFold ntrain = train.shape[0] ntest = test.shape[0] kf = KFold(n_splits=5,random_state=16) # for i,(a,b) in enumerate(kf.split(x)): # print(i) # print(a) # print(b) x = x.as_matrix() y = y.as_matrix() def get_oof(clf,x_train,y_train,x_test): oof_train = np.zeros((ntrain,)) #891 oof_test = np.zeros((ntest,)) #418 oof_test_skf = np.empty((5,ntest)) #5 * 418 for i,(train_index,test_index) in enumerate(kf.split(x_train)): kf_x_train = x_train[train_index] kf_y_train = y_train[train_index] kf_x_test = x_train[test_index] clf.fit(kf_x_train,kf_y_train) oof_train[test_index] = clf.predict(kf_x_test) oof_test_skf[i,:] = clf.predict(x_test) oof_test = oof_test_skf.mean(axis=0) return oof_train.reshape(-1,1),oof_test.reshape(-1,1) train1,test1 = get_oof(clf_gradient,x,y,test) #print(train1.shape,test1.shape) train2,test2 = get_oof(clf_svm,x,y,test) #print(train1.shape,test1.shape) train3,test3 = get_oof(clf_rf,x,y,test) #print(train1.shape,test1.shape) train_temp = np.column_stack((train1,train2)) train_stacking = np.column_stack((train_temp,train3)) #print(train_stacking.shape) test_temp = np.column_stack((test1,test2)) test_stacking = np.column_stack((test_temp,test3)) print(test_stacking.shape) from sklearn.linear_model import LogisticRegression stacking_lr = LogisticRegression() stacking_lr.fit(train_stacking,y) stacking_lr_score = cross_validation.cross_val_score(stacking_lr,train_stacking,y,cv=10) predict_stacking = stacking_lr.predict(test_stacking) submision_stacking = pd.DataFrame({"PassengerId":PassengerId,"Survived":predict_stacking.astype(int)}) submision_stacking.to_csv("submision_stacking.csv",index=False) print("Stacking:",stacking_lr_score) print(stacking_lr_score.mean()) # train = np.empty((ntrain,3)) # test = np.empty((ntest,3)) # print(train.shape,test.shape) # list = [clf_gradient,clf_svm,clf_rf] # # for i,clf in enumerate(list): # print(clf,i) # train[:,i], test[:,i] = get_oof(clf, x, y, test)