# Data preprocessing
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# Load the raw training table and print its column / dtype / non-null summary.
data = pd.read_csv('mytrain.csv')
data.info()

# Remove the columns that are not used as model features.
data.drop(['Name', 'Ticket', 'Cabin', 'Unnamed: 12'], inplace=True, axis=1)

# Fill missing ages with the column mean, then drop every row that still has
# a missing value in any remaining column.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data = data.dropna()

# Fix: bind the distinct Embarked values to `labels`. The original computed
# this list but discarded it, so the encoding step below raised NameError.
labels = data['Embarked'].unique().tolist()
import numpy as np

# Encode Embarked as the position of each value inside `labels`
# (codes follow order of first appearance in the data).
data['Embarked'] = data['Embarked'].map(labels.index)

# Encode Sex as 1 for 'male' and 0 for every other value.
data['Sex'] = (data['Sex'] == 'male').astype('int')
# Hyperparameter tuning
# Features are every column except the target; the target is 'Survived'.
feature_mask = data.columns != 'Survived'
x = data.iloc[:, feature_mask]
y = data['Survived']
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3)

# Replace each split's index with 0..n-1.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.index = range(len(part))

# Baseline 1: default tree fitted on the training split, scored on the
# held-out split.
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
from sklearn.model_selection import cross_val_score

# Baseline 2: mean score of a default tree over 10-fold cross-validation
# on the full feature matrix and target.
clf = DecisionTreeClassifier(random_state=25)
fold_scores = cross_val_score(clf, x, y, cv=10)
score = fold_scores.mean()
score
# Learning curve over max_depth = 1..10 with the entropy criterion.
# tr collects the score on the training split; te collects the mean
# 10-fold cross-validation score on the full data.
depths = range(1, 11)
tr = []
te = []
for depth in depths:
    clf = DecisionTreeClassifier(
        random_state=25,
        max_depth=depth,
        criterion='entropy',
    )
    clf = clf.fit(Xtrain, Ytrain)
    score_tr = clf.score(Xtrain, Ytrain)
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)

# Best cross-validated score across the depths tried.
print(max(te))

# Plot both curves against depth.
plt.plot(depths, tr, color='red', label='train')
plt.plot(depths, te, color='blue', label='test')
plt.xticks(depths)
plt.legend()
plt.show()
# Grid search
import numpy as np
# 50 evenly spaced candidate values in [0, 0.5] for min_impurity_decrease.
gini_threholds = np.linspace(0, 0.5, 50)

# Search space: every combination of these settings is evaluated.
parameters = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': list(range(1, 10)),
    'min_samples_leaf': list(range(1, 50, 5)),
    'min_impurity_decrease': list(gini_threholds),
}

# Exhaustive search with 10-fold cross-validation on the training split.
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10).fit(Xtrain, Ytrain)

# Best parameter combination and its mean cross-validated score.
GS.best_params_
GS.best_score_