import pandas as pd
import numpy as np
data = pd.read_csv('./tt./train.csv')  # load the Titanic training data with pandas
data.columns  # returns the column labels as an Index object
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
data.head(3)  # show the first three rows
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
# data.describe()  # summary statistics (count, mean, etc.) of the numeric columns, for an overall feel of the dataset
data.isnull().sum()  # count the missing values in each column
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
data.info()  # overview: dtypes, non-null counts, memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
data = data[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
# keep Survived, passenger class, sex, age, siblings/spouses aboard (SibSp),
# parents/children aboard (Parch), ticket fare, and port of embarkation
data.head(3)  # PassengerId, Name, Ticket, and Cabin have been dropped
|   | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
Handling missing values
data['Age'] = data['Age'].fillna(data['Age'].mean())  # fill missing Age values with the column mean
data.fillna(0, inplace=True)  # Method 1: fill remaining missing values with a fixed/default value (0 here)
# inplace=True modifies the DataFrame in place instead of returning a new copy, so no extra object needs to be stored.
# data.fillna(data.mean(), inplace=True)   # Method 2: fill with the column mean
# data.fillna(data.median(), inplace=True) # Method 3: fill with the column median
# other options: fill with the mode(), or with the value from the neighboring row
# train_data.fillna(method='pad', inplace=True)  # fill with the previous row's value, which may itself be missing
# The notes below are adapted from https://blog.youkuaiyun.com/jingyi130705008/article/details/82670011
(1) Fill with a fixed value
train_data.fillna(0, inplace=True)  # pick some fixed/default value to fill the missing entries (0 here)
(2) Fill with the mean
train_data.fillna(train_data.mean(), inplace=True)  # for each column, fill missing entries with that column's mean
(3) Fill with the median
For each column, fill missing entries with that column's median.
train_data.fillna(train_data.median(), inplace=True)
(4) Fill with the mode
For each column, fill missing entries with that column's mode. A column with too many missing values can have NaN as its mode, so the mode is taken after dropping the NaN values from each column.
train_data.fillna(train_data.mode(), inplace=True)  # can yield NaN modes when too much data is missing
features_mode = {}
for f in features:  # `features` is the list of column names to fill
    print(f, ':', list(train_data[f].dropna().mode().values))
    features_mode[f] = list(train_data[f].dropna().mode().values)[0]
train_data.fillna(features_mode, inplace=True)
(5) Fill from the neighboring rows
For each missing entry, fill in the value from the row above or below.
train_data.fillna(method='pad', inplace=True)   # forward-fill from the previous row, which may itself be missing
train_data.fillna(0, inplace=True)
train_data.fillna(method='bfill', inplace=True) # backward-fill from the next row, which may itself be missing
train_data.fillna(0, inplace=True)
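As an aside, scikit-learn wraps the same strategies ('mean', 'median', 'most_frequent', 'constant') in a single transformer, SimpleImputer. A minimal sketch on the Age column, assuming scikit-learn is available:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent', 'constant'
data[['Age']] = imputer.fit_transform(data[['Age']])  # fit_transform expects a 2-D input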
data['Sex'] = [1 if x == 'male' else 0 for x in data.Sex]  # encode Sex as 1 (male) / 0 (female)
One-hot encoding
data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)  # 1 where Pclass == 1, else 0
data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)  # 1 where Pclass == 2, else 0
data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)  # 1 where Pclass == 3, else 0
data.head(3)
|   | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | p1 | p2 | p3 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | S | 0 | 0 | 1 |
| 1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C | 1 | 0 | 0 |
| 2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | S | 0 | 0 | 1 |
del data['Pclass']
data.Embarked.unique()  # distinct values; the 0 comes from filling the two missing Embarked entries with 0 earlier
array(['S', 'C', 'Q', 0], dtype=object)
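That stray 0 category exists only because the missing Embarked values were filled with 0. The mode-filling recipe from method (4) above would have avoided it if applied before the fillna(0), e.g.:
# data['Embarked'] = data['Embarked'].fillna(data['Embarked'].dropna().mode()[0])  # fill with the most common port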
data['e1']=np.array(data['Embarked']=='S').astype(np.int32)
data['e2']=np.array(data['Embarked']=='C').astype(np.int32)
data['e3']=np.array(data['Embarked']=='Q').astype(np.int32)
data.head(3)
|   | Survived | Sex | Age | SibSp | Parch | Fare | Embarked | p1 | p2 | p3 | e1 | e2 | e3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 22.0 | 1 | 0 | 7.2500 | S | 0 | 0 | 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 0 | 26.0 | 0 | 0 | 7.9250 | S | 0 | 0 | 1 | 1 | 0 | 0 |
del data['Embarked']
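For reference, pandas can generate all of these dummy columns in one call with pd.get_dummies; a minimal sketch that would replace the manual p1..p3 / e1..e3 construction above (the generated names, e.g. p_1 and e_S, differ from the manual ones, and the original columns are dropped automatically):
# data = pd.get_dummies(data, columns=['Pclass', 'Embarked'], prefix=['p', 'e'])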
data.values.dtype  # all columns are numeric now, so .values is a single float64 array
dtype('float64')
data.head(3)
|   | Survived | Sex | Age | SibSp | Parch | Fare | p1 | p2 | p3 | e1 | e2 | e3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 22.0 | 1 | 0 | 7.2500 | 0 | 0 | 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 0 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 | 1 | 0 | 0 |
Building the feature and target arrays (train / target)
data_train = data[['Sex','Age','SibSp','Parch','Fare','p1','p2','p3','e1','e2','e3']].values
# leave out the Survived column and convert the rest to a numpy.ndarray
data_train  # now a plain numpy array rather than a DataFrame
# data_train.head(3) would raise AttributeError: 'numpy.ndarray' object has no attribute 'head'; head() is a pandas method
array([[ 1. , 22. , 1. , ..., 1. ,
0. , 0. ],
[ 0. , 38. , 1. , ..., 0. ,
1. , 0. ],
[ 0. , 26. , 0. , ..., 1. ,
0. , 0. ],
...,
[ 0. , 29.69911765, 1. , ..., 1. ,
0. , 0. ],
[ 1. , 26. , 0. , ..., 0. ,
1. , 0. ],
[ 1. , 32. , 0. , ..., 0. ,
0. , 1. ]])
# data_target = data['Survived']  # the target is whether the passenger survived
# data_target
data_target = data['Survived'].values.reshape(len(data), 1)  # reshape into a column vector of shape (891, 1)
# data_target
np.shape(data_train), np.shape(data_target)  # shapes of the feature and target arrays
((891, 11), (891, 1))
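As an aside, scikit-learn estimators expect a 1-D target, and a column vector like this triggers a DataConversionWarning during fit. A minimal sketch of the flat alternative:
# data_target = data['Survived'].values  # shape (891,), accepted by scikit-learn without a warning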
Splitting into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_train, data_target, test_size=0.2)  # hold out 20% of the rows as the test set
x_train.shape,x_test.shape
((712, 11), (179, 11))
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train,y_train)
DecisionTreeClassifier()
model.score(x_test, y_test)  # accuracy on the test set
0.7374301675977654
model.score(x_train, y_train)  # accuracy on the training set
0.9845505617977528
- The model scores very high on the training samples but much lower on the test set, which is a clear sign of overfitting. The standard remedy for an overfit DecisionTreeClassifier is pruning.
- Unfortunately, scikit-learn long lacked post-pruning support (cost-complexity pruning via ccp_alpha only arrived in version 0.22), so the usual approach is pre-pruning via parameters such as max_depth.
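For completeness, with scikit-learn >= 0.22 cost-complexity post-pruning can be sketched as follows; the mid-range alpha chosen here is an arbitrary illustrative value, not a tuned one:
path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)  # candidate pruning strengths
alpha = path.ccp_alphas[len(path.ccp_alphas) // 2]  # arbitrary mid-range choice, for illustration only
pruned = DecisionTreeClassifier(ccp_alpha=alpha).fit(x_train, y_train)
pruned.score(x_train, y_train), pruned.score(x_test, y_test)  # the train/test gap typically narrows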
Tuning model parameters: finding the best max_depth
def m_score(depth):
    model = DecisionTreeClassifier(max_depth=depth)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return train_score, test_score
depths = range(2, 15)  # depths 2 through 14; the stop value 15 is excluded
scores = [m_score(depth) for depth in depths]
scores  # (train_score, test_score) pairs for each depth
[(0.8089887640449438, 0.7262569832402235),
(0.8384831460674157, 0.8044692737430168),
(0.8398876404494382, 0.8044692737430168),
(0.8525280898876404, 0.7988826815642458),
(0.8665730337078652, 0.8044692737430168),
(0.8764044943820225, 0.7877094972067039),
(0.8876404494382022, 0.7877094972067039),
(0.8946629213483146, 0.7988826815642458),
(0.9073033707865169, 0.7932960893854749),
(0.9199438202247191, 0.7653631284916201),
(0.9325842696629213, 0.7877094972067039),
(0.9424157303370787, 0.7877094972067039),
(0.9508426966292135, 0.7932960893854749)]
train_s = [s[0] for s in scores]  # training scores (first element of each pair)
test_s = [s[1] for s in scores]   # test scores (second element)
best_score_index = np.argmax(test_s)  # index of the highest test score
best_param = depths[best_score_index]  # the depth at that index
best_score = test_s[best_score_index]  # the score at that index
print('best param:{0:.4f}; best score:{1:.4f}'.format(best_param, best_score))
## for this split the best max_depth is 3, with a test score of 0.8045; the exact values change whenever the data is re-split
best param:3.0000; best score:0.8045
import matplotlib.pyplot as plt
# plt.figure(figsize=(6,4), dpi=144)
# plt.grid()
# plt.xlabel('max depth of decision tree')
# plt.ylabel('score')
# plt.plot(depths, test_s, '.g-', label='cross-validation score')
# plt.plot(depths, train_s, '.r--', label='training score')
# plt.legend()
# plt.show()
# plt.plot(depths, train_s)
# plt.plot(depths, test_s)
plt.show()
Using the same approach, we can examine the parameter min_impurity_decrease (the successor of the deprecated min_impurity_split). It sets a threshold on the information entropy or Gini impurity: after a candidate split, if the impurity decrease falls below this threshold, the node is not split further.
# train the model and compute scores for a given threshold
def m_score(value):
    model = DecisionTreeClassifier(min_impurity_decrease=value)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return train_score, test_score
# sweep the parameter range, training a model and computing scores for each value
values = np.linspace(0, 0.5, 50)
scores = [m_score(value) for value in values]
train_s = [s[0] for s in scores]  # training scores (first element of each pair)
test_s = [s[1] for s in scores]   # test scores (second element)
# find the parameter value with the highest test score
best_index = np.argmax(test_s)  # index of the highest test score
best_param = values[best_index]  # the threshold at that index
best_score = test_s[best_index]  # the score at that index
print('best param:{0:.4f}; best score:{1:.4f}'.format(best_param, best_score))
# the best min_impurity_decrease here is 0.0102, with a corresponding test score of 0.8101
best param:0.0102; best score:0.8101
plt.plot(train_s)
plt.plot(test_s)
[<matplotlib.lines.Line2D at 0x258237c3388>]
[Figure: training and test scores versus min_impurity_decrease (output_57_1.png)]
plt.show()
The approach above fits the model once per candidate value of a single parameter to find the best one, but it has serious limitations.
Problem 1: the results are unstable. Because the split is random, each re-split of the training and cross-validation sets yields different data, a different "optimal" parameter, and a different trained model.
Solution: compute many times and average. For a given parameter value, re-split the data and retrain repeatedly, then report the minimum, maximum, and mean score (see the cross_val_score sketch after this list).
Problem 2: only one parameter can be tuned at a time, so there is no way to balance the optimal choices of two or more parameters jointly.
Solution: extend the code to handle several parameters, or use the GridSearchCV class, which handles multiple parameters at once.
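A minimal sketch of the averaging idea using cross_val_score, which performs the repeated splitting internally and returns one score per fold; max_depth=3 here is simply the value found above, and .ravel() flattens the (891, 1) target:
from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(DecisionTreeClassifier(max_depth=3), data_train, data_target.ravel(), cv=5)  # 5 folds
fold_scores.min(), fold_scores.max(), fold_scores.mean()  # the spread across folds shows how unstable a single split is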
Using GridSearchCV to select the best parameter values
# # Using GridSearchCV to select the best value of a single parameter
# from sklearn.model_selection import GridSearchCV
# # set up the parameter grid
# thresholds = np.linspace(0, 0.5, 50)
# param_grid = {'min_impurity_decrease': thresholds}
# clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, return_train_score=True)
# clf.fit(data_train, data_target)
# print("best param: {0}\nbest score: {1}".format(clf.best_params_, clf.best_score_))
# clf.cv_results_  # stores all intermediate results of the search
from sklearn.model_selection import GridSearchCV
## set up the parameter grid
values = np.linspace(0, 0.5, 50)  # evenly spaced values over the interval [0, 0.5]
depths = range(2, 15)
# the key argument is param_grid: a dict whose keys are parameter names and whose values are lists of candidates
param_grid = {'max_depth': depths, 'min_impurity_decrease': values}
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# GridSearchCV builds and scores a model for every combination of the listed values,
# reporting the mean score and standard deviation for each parameter setting.
# cv=5 controls how the cross-validation sets are generated: each evaluation splits the data
# into 5 folds, using one fold for validation and the rest for training.
model.fit(data_train, data_target)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'max_depth': range(2, 15),
'min_impurity_decrease': array([0. , 0.01020408, 0.02040816, 0.03061224, 0.04081633,
0.05102041, 0.06122449, 0.07142857, 0.08163265, 0.09183673,
0.10204082, 0.1122449 , 0.12244898, 0.13265306, 0.14285714,
0.15306122, 0.16326531, 0.17346939, 0.18367347, 0.19387755,
0.20408163, 0.21428571, 0.2244898 , 0.23469388, 0.24489796,
0.25510204, 0.26530612, 0.2755102 , 0.28571429, 0.29591837,
0.30612245, 0.31632653, 0.32653061, 0.33673469, 0.34693878,
0.35714286, 0.36734694, 0.37755102, 0.3877551 , 0.39795918,
0.40816327, 0.41836735, 0.42857143, 0.43877551, 0.44897959,
0.45918367, 0.46938776, 0.47959184, 0.48979592, 0.5 ])})
model.best_params_
{'max_depth': 6, 'min_impurity_decrease': 0.0}
model.best_score_
0.8159500345238841
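Since refit=True by default, GridSearchCV also retrains the best combination on the full data and exposes it as best_estimator_. A minimal sketch of using it for prediction on the held-out rows from the earlier split:
best_tree = model.best_estimator_  # the best (max_depth, min_impurity_decrease) combination, refitted
predictions = best_tree.predict(x_test)  # predicted Survived labels for the test rows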
# Reference: https://blog.youkuaiyun.com/silvia__y/article/details/103555781