推荐系统-用户标签预测算法基础实践
1.泰坦尼克号获救人员识别实战
- 加强iris的代码实战(掌握)
- 代码版本一 : 未经过PCA降维的X
# Iris classification, version 1: model trained on the raw 4 features (no PCA).
#1. Load the dataset
from sklearn.datasets import load_iris
iris=load_iris()
#2. Quick exploratory look at the bundle
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))#iris data: <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris)
# print(df)
#2.1 Optional plotting (left disabled)
import seaborn as sns
import matplotlib.pyplot as plt
# sns.pairplot(iris, hue="sepal length (cm)")
# plt.show()
#3. Features (X) and labels (y)
X=iris.data
y=iris.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
#4. Feature scaling.
# NOTE: MinMaxScaler rescales every feature into feature_range (default (0, 1));
# it does NOT standardize to zero mean / unit variance — that would be
# StandardScaler. The original comments here quoted the StandardScaler
# docstring, which did not match the code.
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() #feature_range : tuple (min, max), default=(0, 1)
X_train_std=sc.fit_transform(X_train)  # fit the scaler on the training split only...
X_test_std=sc.transform(X_test)        # ...then apply the SAME scaling to the test split
#4.1 Dimensionality reduction — PCA (disabled in this version)
# from sklearn.decomposition import PCA
# pca=PCA(n_components=2)
# pca.fit(X_train_std)
# # print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
#5. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="entropy")
dtc.fit(X_train_std,y_train)
#6. Predict on the held-out split
y_pred=dtc.predict(X_test_std)
print(y_pred)
#7. Evaluate (accuracy)
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 0.8666666666666667
# #7b. Persist the model.
# # NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23+;
# # import joblib directly instead.
# import joblib
# joblib.dump(dtc,"dtctree.pkl")
# #8. Visualize the tree
# from sklearn.tree import export_graphviz
# export_graphviz(dtc,filled=True)
- 代码版本二 : X经过PCA降维处理
# Iris classification, version 2: features reduced to 2 components with PCA.
#1. Load the dataset
from sklearn.datasets import load_iris
iris=load_iris()
#2. Quick exploratory look at the bundle
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))#iris data: <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris)
# print(df)
#2.1 Optional plotting (left disabled)
import seaborn as sns
import matplotlib.pyplot as plt
# sns.pairplot(iris, hue="sepal length (cm)")
# plt.show()
#3. Features (X) and labels (y)
X=iris.data
y=iris.target
# Split FIRST so the test set stays completely unseen by any fitted transform.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
#4.1 Dimensionality reduction — PCA.
# BUG FIX: the original called pca.fit_transform on the WHOLE dataset before
# splitting, leaking test-set statistics into the projection (which inflated
# the reported test score). Fit PCA on the training split only, then apply
# the learned projection to the test split.
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)
print(":"*1000)
print(X_train)
print(":"*1000)
# print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
#4. Feature scaling — MinMaxScaler maps each feature into (0, 1) by default
# (it does not standardize to zero mean / unit variance).
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() #feature_range : tuple (min, max), default=(0, 1)
X_train_std=sc.fit_transform(X_train)  # fit on train only...
X_test_std=sc.transform(X_test)        # ...then reuse the same scaling for test
#5. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="entropy")
dtc.fit(X_train_std,y_train)
#6. Predict on the held-out split
y_pred=dtc.predict(X_test_std)
print(y_pred)
#7. Evaluate (accuracy)
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# (The original reported a perfect 1.0 test score — a symptom of the leakage
# fixed above; expect a slightly lower, honest number now.)
# #7b. Persist the model (joblib is imported directly in modern scikit-learn)
# import joblib
# joblib.dump(dtc,"dtctree.pkl")
# #8. Visualize the tree
# from sklearn.tree import export_graphviz
# export_graphviz(dtc,filled=True)
- 泰坦尼克号的问题
- 加载数据–pandas和自身的数据
- 预处理
- 特征工程—类别型数据的处理
- 建立模型
- 模型预测
- 模型校验
代码
# Titanic survival prediction: load CSV, impute age, one-hot encode
# categorical columns via DictVectorizer, fit a decision tree.
#1. Read the data
import pandas as pd
import os
datapath=os.path.join(".","tantanic.txt")
tantanic=pd.read_csv(datapath)

def show_data_info():
    """Print dataset shape/info and draw a survival-by-sex/pclass bar plot."""
    print(tantanic.shape) # (1313, 11)
    print(tantanic.info())
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.catplot(x="sex", y="survived", hue="pclass", kind="bar", data=tantanic)
    plt.show()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1313 entries, 0 to 1312
# Data columns (total 11 columns):
# row.names    1313 non-null int64
# pclass       1313 non-null object
# survived     1313 non-null int64
# name         1313 non-null object
# age           633 non-null float64
# embarked      821 non-null object
# home.dest     754 non-null object
# room           77 non-null object
# ticket         69 non-null object
# boat          347 non-null object
# sex          1313 non-null object

# Show the plots / summary info
show_data_info()
#2. Feature engineering
#2.1 Select features. BUG FIX: take an explicit copy — the original mutated a
# slice of `tantanic`, which triggers pandas' SettingWithCopyWarning and may
# silently fail to modify X (chained assignment).
X=tantanic[["age","pclass","sex"]].copy()
print(X)
y=tantanic["survived"]
#2.2 Missing values — impute the age column with its mean (31.194181).
# Assign the result back instead of using fillna(..., inplace=True) on a slice.
X["age"]=X["age"].fillna(X["age"].mean())
print(X)
#2.3 Split into train/test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
#2.4 Categorical columns: DictVectorizer one-hot encodes string-valued fields
# (equivalent to LabelEncoder + OneHotEncoder) while passing numeric age through.
from sklearn.feature_extraction import DictVectorizer
dv=DictVectorizer(sparse=False)
X_train_dv=dv.fit_transform(X_train.to_dict(orient="records"))
X_test_dv=dv.transform(X_test.to_dict(orient="records"))
print(X_train_dv)
print(dv.feature_names_)
# [[45.         0. 0. 1. 1. 0. ]
#  [31.19418104 0. 0. 1. 0. 1. ]
#  ...
# ['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
#3. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="gini")
print(dtc.fit(X_train_dv,y_train))
#4. Predict on the held-out split
y_pred=dtc.predict(X_test_dv)
print(y_pred)
#5. Evaluate (accuracy)
print("model in trainset:",dtc.score(X_train_dv,y_train))
print("model in testset:",dtc.score(X_test_dv,y_test))
# model in trainset: 0.8657142857142858
# model in testset: 0.779467680608365
from sklearn.metrics import confusion_matrix  # imported but not yet used below

本文深入讲解机器学习中的关键算法,包括决策树、随机森林、集成学习等,通过实战案例如房价预测、鸢尾花分类,详述算法原理及应用技巧。
最低0.47元/天 解锁文章
4336

被折叠的评论
为什么被折叠?



