一、实例
使用决策树预测泰坦尼克号乘客的生死
import pandas as pd
from sklearn.tree import DecisionTreeClassifier #决策树分类器。
from sklearn.feature_extraction import DictVectorizer # 将特征值映射列表转换为向量
from sklearn.model_selection import train_test_split # 将数据集拆分成训练集和测试集
def decision():
    """
    Predict Titanic passenger survival with a decision tree.

    Loads the titanic3 spreadsheet, uses the ``pclass``, ``age`` and ``sex``
    columns as features and ``survived`` as the target, fills missing ages
    with the mean age, vectorizes the feature records with ``DictVectorizer``,
    fits a ``DecisionTreeClassifier`` (gini criterion, ``max_depth=5``) and
    prints the vectorized feature names followed by the score on a held-out
    25% test split.

    :return: None; the feature names and the test score are printed.
    """
    # Load the data set.
    titan = pd.read_excel(r"http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls")

    # Pick the feature columns and the target column (whether the passenger
    # survived). Copy the selection so it is an independent frame that can be
    # modified without touching ``titan``.
    x = titan[["pclass", "age", "sex"]].copy()
    y = titan["survived"]

    # Fill missing ages with the mean age. The result is assigned back rather
    # than using ``inplace=True`` on a column selection: that chained form
    # raises SettingWithCopyWarning and does nothing under copy-on-write.
    x["age"] = x["age"].fillna(x["age"].mean())

    # Split into training and test sets (25% held out for testing).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: turn each row into a dict so DictVectorizer can
    # one-hot encode the string-valued features.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        print(vectorizer.get_feature_names_out().tolist())
    else:
        print(vectorizer.get_feature_names())

    # Only transform the test set: the vocabulary comes from the training data.
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))

    # Fit the decision tree on the training data.
    dec = DecisionTreeClassifier(criterion="gini", max_depth=5)
    dec.fit(x_train, y_train)

    # Report the accuracy on the test set.
    print(dec.score(x_test, y_test))
# Run the Titanic survival demo only when this file is executed as a script.
if __name__ == "__main__":
    decision()