Experiment Objectives
- Understand the principles of linear models and be able to use them to solve related problems;
- Become proficient in using the linear-model modules in Scikit-learn to solve problems.
Experiment Content
Use linear discriminant analysis (LDA) and logistic regression to solve the handwritten digit recognition problem.
Experiment Requirements
- Briefly describe the problem and state the principles by which linear discriminant analysis and logistic regression solve classification problems (a short sketch follows this list);
- Split the relevant dataset into a training set and a test set;
- Provide code that applies linear discriminant analysis and logistic regression to handwritten digit recognition;
- Cross-validate the trained models and use the results to compare and evaluate the different models.
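Problem description: load_digits provides 1797 grayscale images of handwritten digits (8×8 pixels, flattened into 64 features) with labels 0-9, so the task is ten-class classification. As background, a brief sketch of the two principles in their standard textbook form (not a full derivation):
- Logistic regression models the log-odds of a class as a linear function of the features, $\ln\frac{y}{1-y} = w^T x + b$, equivalently $y = \frac{1}{1+e^{-(w^T x + b)}}$, and estimates $w, b$ by maximum likelihood; for the ten digit classes Scikit-learn uses a multinomial (softmax) extension of this model.
- Linear discriminant analysis seeks a projection $w$ that maximizes the ratio of between-class to within-class scatter, $J(w) = \frac{w^T S_b w}{w^T S_w w}$, so that projected samples of the same class lie close together while different classes are well separated; a new sample is then assigned to the class whose projected center it is nearest to.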
Linear Discriminant Analysis
from sklearn.model_selection import train_test_split # module for splitting data into training and test sets
from sklearn.datasets import load_digits # module for loading the handwritten digits dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # import the linear discriminant analysis model
digits = load_digits() # load the dataset
X = digits.data # feature matrix: one 64-dimensional pixel vector per image
y = digits.target # class labels 0-9
"""
X:待划分的样本数据
y:待划分的对应样本数据的样本标签
test_size:
1)浮点数,在0 ~ 1之间,表示样本占比(test_size = 0.3,则样本数据中有30%的数据作为测试数据,记入X_test,其余70%数据记入X_train,同时适用于样本标签);
2)整数,表示样本数据中有多少数据记入X_test中,其余数据记入X_train
random_state:随机数种子,种子不同,每次采的样本不一样;种子相同,采的样本不变.
shuffle:洗牌模式
1)shuffle = False,不打乱样本数据顺序;
2)shuffle = True,打乱样本数据顺序
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # split the dataset
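As an illustration of the parameter forms described in the docstring above, the following sketch (hypothetical variable names, not used in the rest of this report) shows an integer test_size and shuffle=False:
# hypothetical alternatives, for illustration only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=500, random_state=42) # put exactly 500 samples into the test set
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, shuffle=False) # keep the original sample order; no shuffling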
model = LinearDiscriminantAnalysis() # build the linear discriminant analysis model
model.fit(X_train, y_train) # train the model
Out[9]: LinearDiscriminantAnalysis()
model.predict(X_test) # predict on the test set
Out[10]: # predicted labels
array([6, 9, 3, 7, 2, 2, 5, 2, 5, 3, 1, 9, 4, 0, 4, 2, 3, 7, 8, 8, 4, 3,
9, 7, 5, 6, 3, 5, 6, 3, 4, 9, 1, 4, 4, 6, 9, 4, 7, 6, 6, 9, 1, 3,
6, 1, 3, 0, 6, 5, 5, 1, 9, 5, 6, 0, 9, 0, 0, 1, 0, 4, 5, 2, 4, 5,
7, 0, 7, 5, 9, 9, 5, 4, 7, 0, 7, 5, 5, 9, 9, 0, 2, 3, 8, 0, 6, 4,
4, 9, 1, 2, 8, 3, 5, 2, 9, 0, 4, 4, 4, 3, 5, 3, 1, 3, 5, 9, 4, 2,
7, 7, 4, 4, 1, 9, 2, 7, 8, 7, 2, 6, 9, 4, 0, 7, 2, 7, 5, 8, 7, 5,
7, 9, 0, 6, 6, 4, 2, 8, 0, 9, 4, 6, 9, 9, 6, 9, 0, 5, 5, 6, 6, 0,
6, 4, 3, 9, 3, 8, 7, 2, 9, 0, 4, 5, 3, 6, 5, 8, 9, 8, 4, 2, 1, 3,
7, 7, 2, 2, 3, 9, 8, 0, 3, 2, 3, 5, 6, 9, 9, 4, 1, 5, 4, 2, 3, 6,
4, 8, 5, 9, 5, 7, 8, 9, 4, 8, 1, 5, 4, 4, 9, 6, 1, 8, 6, 0, 4, 5,
2, 7, 1, 6, 4, 5, 6, 0, 3, 2, 3, 6, 7, 9, 9, 1, 4, 7, 6, 5, 1, 5,
5, 1, 9, 2, 8, 8, 9, 8, 7, 6, 2, 2, 2, 3, 4, 8, 8, 3, 6, 0, 9, 7,
7, 0, 1, 0, 4, 5, 1, 5, 3, 6, 0, 4, 1, 0, 0, 3, 6, 5, 9, 7, 3, 5,
5, 9, 9, 8, 5, 3, 3, 2, 0, 5, 8, 3, 4, 0, 2, 4, 6, 4, 3, 4, 5, 0,
5, 2, 1, 3, 1, 4, 1, 1, 7, 0, 1, 5, 2, 1, 2, 8, 7, 0, 6, 4, 8, 1,
5, 1, 8, 4, 5, 8, 5, 9, 8, 6, 0, 6, 2, 0, 7, 9, 1, 9, 5, 2, 7, 7,
9, 8, 7, 4, 3, 8, 3, 5, 6, 0, 0, 3, 0, 5, 0, 0, 4, 1, 2, 8, 4, 5,
9, 6, 3, 1, 8, 8, 4, 2, 3, 8, 9, 8, 8, 5, 0, 6, 3, 3, 7, 1, 6, 4,
1, 2, 1, 8, 6, 4, 7, 4, 8, 3, 4, 0, 5, 1, 9, 4, 5, 7, 6, 3, 7, 0,
5, 9, 7, 5, 9, 7, 4, 2, 2, 9, 0, 7, 5, 2, 3, 6, 3, 9, 6, 9, 5, 0,
1, 5, 5, 8, 3, 3, 6, 2, 6, 5, 5, 2, 0, 8, 7, 3, 7, 0, 2, 2, 3, 5,
8, 7, 3, 6, 5, 9, 9, 2, 5, 6, 3, 0, 7, 1, 1, 9, 6, 1, 1, 0, 0, 2,
9, 8, 7, 9, 3, 7, 7, 1, 3, 5, 4, 6, 1, 2, 1, 1, 8, 7, 6, 9, 2, 0,
4, 4, 8, 8, 7, 1, 3, 1, 7, 1, 3, 5, 1, 7, 0, 0, 2, 2, 6, 9, 4, 8,
9, 0, 6, 7, 7, 9, 5, 4, 7, 0, 7, 6, 8, 7, 1, 4, 6, 2, 8, 7, 5, 9,
0, 3, 9, 6, 6, 1, 9, 1, 2, 9, 8, 9, 7, 4, 8, 5, 5, 9, 7, 7, 6, 8,
1, 3, 5, 7, 9, 5, 9, 2, 1, 1, 2, 2, 4, 8, 7, 5, 8, 8, 9, 4, 9, 0])
model.score(X_test, y_test) # accuracy on the test set
Out[12]: 0.9494949494949495
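About 5% of the test samples are misclassified; to see which digits are confused with which, per-class metrics can be inspected. A minimal sketch (an extra evaluation step, not part of the original session), reusing the fitted LDA model:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = model.predict(X_test) # predictions of the fitted LDA model
print(confusion_matrix(y_test, y_pred)) # rows: true digit, columns: predicted digit
print(classification_report(y_test, y_pred)) # per-class precision, recall and F1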
Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression # import the logistic regression model
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model = LogisticRegression(random_state=0, solver='newton-cg')
model.fit(X_train, y_train)
Out[5]: LogisticRegression(random_state=0, solver='newton-cg')
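The solver is set to 'newton-cg' above because the default 'lbfgs' solver, with its default max_iter=100, often stops with a ConvergenceWarning on the unscaled digit features. A hedged alternative sketch (alt_model is a hypothetical name, not the configuration evaluated below) is to standardize the features and raise the iteration limit instead:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# assumed alternative configuration: standardize the pixels, then fit with the default solver
alt_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=0))
alt_model.fit(X_train, y_train)
alt_model.score(X_test, y_test)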
model.predict(X_test)
Out[6]:
array([6, 9, 3, 7, 2, 1, 5, 2, 5, 2, 1, 9, 4, 0, 4, 2, 3, 7, 8, 8, 4, 3,
9, 7, 5, 6, 3, 5, 6, 3, 4, 9, 1, 4, 4, 6, 9, 4, 7, 6, 6, 9, 1, 3,
6, 1, 3, 0, 6, 5, 5, 1, 9, 5, 6, 0, 9, 0, 0, 1, 0, 4, 5, 2, 4, 5,
7, 0, 7, 5, 9, 5, 5, 4, 7, 0, 4, 5, 5, 9, 9, 0, 2, 3, 8, 0, 6, 4,
4, 9, 1, 2, 8, 3, 5, 2, 9, 0, 4, 4, 4, 3, 5, 3, 1, 3, 5, 9, 4, 2,
7, 7, 4, 4, 1, 9, 2, 7, 8, 7, 2, 6, 9, 4, 0, 7, 2, 7, 5, 8, 7, 5,
7, 5, 0, 6, 6, 4, 2, 8, 0, 9, 4, 6, 9, 9, 6, 9, 0, 5, 5, 6, 6, 0,
6, 4, 3, 9, 3, 8, 7, 2, 9, 0, 4, 5, 3, 6, 5, 9, 9, 8, 4, 2, 1, 3,
7, 7, 2, 2, 3, 9, 8, 0, 3, 2, 2, 5, 6, 9, 9, 4, 1, 5, 4, 2, 3, 6,
4, 8, 5, 9, 5, 7, 8, 9, 4, 8, 1, 5, 4, 4, 9, 6, 1, 8, 6, 0, 4, 5,
2, 7, 1, 6, 4, 5, 6, 0, 3, 2, 3, 6, 7, 1, 9, 1, 4, 7, 6, 5, 8, 5,
5, 1, 5, 2, 8, 8, 9, 9, 7, 6, 2, 2, 2, 3, 4, 8, 8, 3, 6, 0, 9, 7,
7, 0, 1, 0, 4, 5, 1, 5, 3, 6, 0, 4, 1, 0, 0, 3, 6, 5, 9, 7, 3, 5,
5, 9, 9, 8, 5, 3, 3, 2, 0, 5, 8, 3, 4, 0, 2, 4, 6, 4, 3, 4, 5, 0,
5, 2, 1, 3, 1, 4, 1, 1, 7, 0, 1, 5, 2, 1, 2, 8, 7, 0, 6, 4, 8, 8,
5, 1, 8, 4, 5, 8, 7, 9, 8, 6, 0, 6, 2, 0, 7, 9, 8, 9, 5, 2, 7, 7,
1, 8, 7, 4, 3, 8, 3, 5, 6, 0, 0, 3, 0, 5, 0, 0, 4, 1, 2, 8, 4, 5,
9, 6, 3, 1, 8, 8, 4, 2, 3, 8, 9, 8, 8, 5, 0, 6, 3, 3, 7, 1, 6, 4,
1, 2, 1, 1, 6, 4, 7, 4, 8, 3, 4, 0, 5, 1, 3, 4, 5, 7, 6, 3, 7, 0,
5, 9, 7, 5, 9, 7, 4, 2, 2, 9, 0, 7, 5, 2, 3, 6, 3, 9, 6, 9, 5, 0,
1, 5, 5, 8, 3, 3, 6, 2, 6, 5, 5, 2, 0, 8, 7, 3, 7, 0, 2, 2, 3, 5,
8, 7, 3, 6, 5, 9, 9, 2, 1, 6, 3, 0, 7, 1, 1, 9, 6, 1, 1, 0, 0, 2,
9, 3, 9, 9, 3, 7, 7, 1, 3, 5, 4, 6, 8, 2, 1, 1, 8, 7, 6, 9, 2, 0,
4, 4, 8, 8, 7, 1, 3, 1, 7, 1, 8, 5, 1, 7, 0, 0, 2, 2, 6, 9, 4, 8,
9, 0, 6, 7, 7, 9, 5, 4, 7, 0, 7, 6, 8, 7, 1, 4, 6, 2, 8, 7, 5, 9,
0, 3, 9, 6, 6, 1, 9, 1, 2, 9, 8, 9, 7, 4, 8, 5, 5, 9, 7, 7, 6, 8,
1, 3, 5, 7, 9, 5, 5, 2, 1, 1, 2, 2, 4, 8, 7, 5, 8, 8, 9, 4, 9, 0])
model.predict_proba(X_test) # predicted class probabilities for each test sample
Out[8]:
array([[3.76758557e-07, 1.65932051e-10, 5.88536423e-13, ...,
1.88104493e-10, 5.48175626e-07, 4.06528018e-11],
[3.29584910e-07, 1.41140645e-12, 2.51482932e-12, ...,
2.38971258e-07, 1.62774466e-08, 9.93329101e-01],
[1.88800433e-15, 1.47690077e-16, 9.07702605e-10, ...,
6.29134202e-13, 1.02243044e-07, 2.81221978e-08],
...,
[1.35879344e-07, 1.15469927e-11, 4.27548590e-18, ...,
2.49267818e-09, 5.81031456e-11, 2.75755474e-23],
[2.56308113e-11, 9.69164883e-16, 2.78868610e-11, ...,
1.02673127e-12, 1.03905484e-05, 9.99989359e-01],
[9.99972313e-01, 3.12905189e-17, 2.73404863e-05, ...,
2.98298164e-09, 9.01281885e-08, 2.64599867e-10]])
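Each row of the matrix above holds the predicted probability of every digit class for one test sample, with columns ordered as in model.classes_. A quick sketch (an addition, not part of the original session) checking that the most probable class per row reproduces model.predict:
import numpy as np
proba = model.predict_proba(X_test)
print(model.classes_) # column order of the probability matrix: digits 0-9
print(np.all(model.classes_[np.argmax(proba, axis=1)] == model.predict(X_test))) # expected to print True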
model.score(X_test, y_test) # accuracy on the test set
Out[9]: 0.9730639730639731
Ten-Fold Cross-Validation
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
LDAmodel = LinearDiscriminantAnalysis() # LDA model with default settings
LRmodel = LogisticRegression() # logistic regression model with default settings
digits = load_digits()
cross_val_score(LDAmodel, digits.data, digits.target, cv=10) # ten-fold cross-validation accuracy of LDA
Out[4]:
array([0.90555556, 0.96111111, 0.9 , 0.87777778, 0.93333333,
0.89444444, 0.97222222, 0.93296089, 0.87709497, 0.91620112])
cross_val_score(LRmodel, digits.data, digits.target, cv=10) # ten-fold cross-validation accuracy of logistic regression
Out[7]:
array([0.90555556, 0.96111111, 0.87777778, 0.92777778, 0.94444444,
0.96666667, 0.95 , 0.93854749, 0.87150838, 0.93854749])
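To compare the two models as required, the ten fold scores can be summarized by their mean and standard deviation; a minimal sketch that re-runs the cross-validation and prints the summary:
lda_scores = cross_val_score(LDAmodel, digits.data, digits.target, cv=10)
lr_scores = cross_val_score(LRmodel, digits.data, digits.target, cv=10)
print("LDA: mean %.4f, std %.4f" % (lda_scores.mean(), lda_scores.std()))
print("LR : mean %.4f, std %.4f" % (lr_scores.mean(), lr_scores.std()))
Averaging the scores shown above gives roughly 0.92 for LDA and 0.93 for logistic regression, so under ten-fold cross-validation logistic regression performs slightly better on this dataset.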