推荐系统-用户标签预测算法基础实践-决策树2

本文深入讲解机器学习中的关键算法,包括决策树、随机森林、集成学习等,通过实战案例如房价预测、鸢尾花分类,详述算法原理及应用技巧。

推荐系统-用户标签预测算法基础实践

1.泰坦尼克号获救人员识别实战

  • 加强iris的代码实战(掌握)
  • 代码版本一 : 未经过PCA降维的X
#1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
#2. Simple statistical inspection of the dataset
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))#iris data: <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris  features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)  # note: DataFrame(iris) would fail — iris is a Bunch
# print(df)
#2.1 Plotting (optional, left disabled)
import seaborn  as sns
import matplotlib.pyplot as plt
# sns.pairplot(iris, hue="sepal length (cm)")
# plt.show()
#3. Feature matrix X and label vector y
X=iris.data
y=iris.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
#4. Feature scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# NOTE: MinMaxScaler rescales each feature into [0, 1]; it does NOT
# standardize to zero mean / unit variance (that would be StandardScaler).
sc=MinMaxScaler() #feature_range : tuple (min, max), default=(0, 1)
# Fit the scaler on the training set only, then apply the same
# transform to the test set to avoid leaking test statistics.
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)
#5. Build the model
from sklearn.tree import DecisionTreeClassifier
# random_state pinned for reproducible tree construction,
# consistent with random_state=22 used in train_test_split above.
dtc=DecisionTreeClassifier(criterion="entropy",random_state=22)
dtc.fit(X_train_std,y_train)
#6. Predict with the fitted model
y_pred=dtc.predict(X_test_std)
print(y_pred)
#7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 0.8666666666666667
# #8. Persist the model
# # (sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# # import joblib directly instead.)
# import joblib
# joblib.dump(dtc,"dtctree.pkl")
# #9. Visualize the tree
# from sklearn.tree import export_graphviz
# export_graphviz(dtc,filled=True)
  • 代码版本二 : X经过PCA降维处理
#1. Load the iris dataset
from sklearn.datasets import load_iris
iris=load_iris()
#2. Quick inspection of the bundled data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))#iris data: <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris  features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
#2.1 Plotting imports (kept for optional visualization)
import seaborn as sns
import matplotlib.pyplot as plt
#3. Feature matrix and label vector
X, y = iris.data, iris.target

#4.1 Dimensionality reduction: project the 4 raw features onto 2 principal components
from sklearn.decomposition import PCA
reducer = PCA(n_components=2)
X = reducer.fit_transform(X)
separator = ":" * 1000
print(separator)
print(X)
print(separator)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22)
#4.2 Scale each reduced feature into [0, 1]
from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = MinMaxScaler()  # feature_range: tuple (min, max), default=(0, 1)
# Fit on the training split only; reuse the fitted ranges for the test split.
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

#5. Train an entropy-criterion decision tree on the reduced features
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion="entropy")
dtc.fit(X_train_std, y_train)
#6. Predict on the held-out split
y_pred = dtc.predict(X_test_std)
print(y_pred)
#7. Evaluate on both splits
print("model in trainset score is:", dtc.score(X_train_std, y_train))
print("model in testset score is:", dtc.score(X_test_std, y_test))
# model in trainset score is: 1.0
# model in testset score is: 1.0
#8. Persistence / visualization left disabled
# from sklearn.externals import joblib
# joblib.dump(dtc, "dtctree.pkl")
# from sklearn.tree import export_graphviz
# export_graphviz(dtc, filled=True)
  • 泰坦尼克号的问题
    • 加载数据–pandas和自身的数据
    • 预处理
    • 特征工程—类别型数据的处理
    • 建立模型
    • 模型预测
    • 模型校验

代码

#1. Load the data
import pandas as pd
import os
datapath=os.path.join(".","tantanic.txt")
tantanic=pd.read_csv(datapath)

# Print basic shape/info for the dataset and draw a survival-rate
# bar chart grouped by sex and passenger class.
def show_data_info():
    print(tantanic.shape)  # (1313, 11)
    print(tantanic.info())
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.catplot(x="sex", y="survived", hue="pclass", kind="bar", data=tantanic);
    plt.show()
    # Columns: row.names, pclass, survived, name, age (633 non-null),
    # embarked, home.dest, room, ticket, boat, sex  — 1313 rows total.
# Show the dataset summary and plot
show_data_info()
#2. Feature engineering
#2.1 Select features
# FIX: take an explicit copy — the original sliced view plus
# fillna(inplace=True) is chained assignment, which only raises
# SettingWithCopyWarning and may silently fail to update X
# (it is an error in pandas >= 3.0).
X=tantanic[["age","pclass","sex"]].copy()
print(X)
y=tantanic["survived"]
#2.2 Missing values: impute the age column with its mean
# FIX: assign the result instead of fillna(..., inplace=True);
# in-place fillna on a column is deprecated in pandas 2.x.
# NOTE(review): the mean is computed before the train/test split,
# which leaks test statistics — acceptable for a tutorial, but in
# production fit the imputer on the training split only.
X["age"]=X["age"].fillna(X["age"].mean())#31.194181
print(X)
#2.3 Split the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
#2.4 Encode categorical variables (pclass, sex):
# DictVectorizer one-hot encodes string values and passes numbers through,
# replacing a manual LabelEncoder + OneHotEncoder pipeline.
from sklearn.feature_extraction import DictVectorizer
dv=DictVectorizer(sparse=False)
X_train_dv=dv.fit_transform(X_train.to_dict(orient="records"))
X_test_dv=dv.transform(X_test.to_dict(orient="records"))
print(X_train_dv)
print(dv.feature_names_)
# ['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
#3. Build the model
from sklearn.tree import DecisionTreeClassifier
# random_state pinned for reproducibility, matching the split's random_state=22.
dtc=DecisionTreeClassifier(criterion="gini",random_state=22)
print(dtc.fit(X_train_dv,y_train))
#4. Predict
y_pred=dtc.predict(X_test_dv)
print(y_pred)
#5. Evaluate
print("model in trainset:",dtc.score(X_train_dv,y_train))
print("model in testset:",dtc.score(X_test_dv,y_test))
# model in trainset: 0.8657142857142858
# model in testset: 0.779467680608365
from sklearn.metrics import confusion_matrix
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值