import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import numpy as np
# Load the passenger training CSV and take a first look at it.
data = pd.read_csv(r"D:\train.csv")
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64
 7   Parch        891 non-null    int64
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
"""
# --- Preprocessing: drop unused columns, handle missing values, encode text ---
# Every report below calls data.info() directly: it prints by itself and
# returns None, so print(data.info()) would append a stray "None" line.

# Remove the Cabin, Name and Ticket columns; the shape is shown before and
# after so the effect of the drop is visible.
print(data.shape)
data.drop(columns=["Cabin", "Name", "Ticket"], inplace=True)
print(data.shape)

# Fill missing ages with the mean age. The filled column is computed once,
# stored, and then displayed.
data["Age"] = data["Age"].fillna(data["Age"].mean())
print(data["Age"])
data.info()

# Discard every row that still contains a missing value in any column.
data = data.dropna(axis=0)
data.info()

# Encode Embarked as integers: each distinct value is replaced by its position
# in `labels` (order of first appearance). A dict lookup gives the same codes
# as labels.index(x) without scanning the list for every row.
print(data["Embarked"].unique())
labels = data["Embarked"].unique().tolist()
embarked_codes = {label: code for code, label in enumerate(labels)}
data["Embarked"] = data["Embarked"].map(embarked_codes)
print(data["Embarked"])
data.info()

# Encode Sex as 1 for "male" and 0 for every other value.
data["Sex"] = (data["Sex"] == "male").astype("int")
print(data["Sex"])
data.info()
# --- Separate the features from the target and hold out a test set ---
# Boolean mask over the columns: True for every column except "Survived".
feature_mask = data.columns != "Survived"
print(feature_mask)
x = data.loc[:, feature_mask]
# y is kept as a one-column DataFrame holding only "Survived".
y = data.loc[:, ~feature_mask]

# 70/30 split. The seed is fixed so that the split, and every score computed
# from it, is the same on each run; without it the partition is re-drawn at
# random every time the script is executed.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=30
)

# The split keeps the shuffled original row labels; renumber each part from 0
# so that positions and index labels agree.
for subset in (xtrain, xtest, ytrain, ytest):
    subset.index = range(subset.shape[0])
print(xtest)
print(ytest)
# --- Baseline: default decision tree scored with 10-fold cross-validation ---
# The tree is seeded so repeated runs build the same model.
clf = DecisionTreeClassifier(random_state=30)
# One score per fold, computed on the full feature/target set (x, y).
fold_scores = cross_val_score(clf, x, y, cv=10)
# The baseline figure is the average over the ten folds.
score = fold_scores.mean()
print(score)
# --- Hyper-parameter search for the decision tree on the training split ---
# Candidate values for each tree parameter; GridSearchCV tries every
# combination.
parameters = {
    "criterion": ("gini", "entropy"),
    "splitter": ("best", "random"),
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)],
    "min_impurity_decrease": [*np.linspace(0, 0.5, 20)],
}

# The grid holds 2 * 2 * 9 * 10 * 20 = 7200 candidates and each one is fitted
# on 10 folds (72,000 fits). n_jobs=-1 spreads those independent fits over all
# available CPU cores; the selected parameters and scores are unchanged.
gs = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
gs = gs.fit(xtrain, ytrain)

# Best parameter combination and its mean cross-validated score.
print(gs.best_params_)
print("-------")
print(gs.best_score_)