目录
测试我们的算法 划分出测试集对模型效果进行评估
用sklearn中的train_test_split进行训练集和测试集的划分
1. K近邻的python实现
(1) 计算训练集中的每个样本与新加入样本的distance并保存下来
(2) 对distance从小到大排序,找到距离最近的前k个样本
(3) 利用collections模块中Counter,统计前k个元素的值和对应的出现次数, 出现次数最多的类作为预测的新样本的类
>>> raw_data_X = [[3.393533211, 2.331273381],
... [3.110073483, 1.781539638],
... [1.343808831, 3.368360954],
... [3.582294042, 4.679179110],
... [2.280362439, 2.866990263],
... [7.423436942, 4.696522875],
... [5.745051997, 3.533989803],
... [9.172168622, 2.511101045],
... [7.792783481, 3.424088941],
... [7.939820817, 0.791637231]
... ]
>>> raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# 转成numpy数组
>>> X_train = np.array(raw_data_X)
>>> y_train = np.array(raw_data_y)
>>> x = np.array([8.093607318, 3.365731514])
# 绘图
>>> plt.scatter(X_train[y_train==0, 0], X_train[y_train==0, 1], marker="+")
<matplotlib.collections.PathCollection object at 0x7f6f30cae908>
>>> plt.scatter(X_train[y_train==1, 0], X_train[y_train==1, 1], marker="o")
<matplotlib.collections.PathCollection object at 0x7f6f2e76c8d0>
>>> plt.scatter(x[0], x[1], color="green", marker="x")
<matplotlib.collections.PathCollection object at 0x7f6f2e76cf60>
>>> plt.show()
# 计算新样本x与每个训练样本之间的欧氏距离(需先执行 from math import sqrt)
>>> distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
>>> distances
[4.812566907609877, 5.229270827235305, 6.749798999160064, 4.6986266144110695, 5.83460014556857, 1.4900114024329525, 2.354574897431513, 1.3761132675144652, 0.3064319992975, 2.5786840957478887]
# 按距离从小到大排序, argsort返回的是排序后各元素在原数组中的索引
>>> nearest = np.argsort(distances)
>>> nearest
array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2])
# 取出距离最近的k个样本所对应的标签
>>> k = 6
>>> topK_y = [y_train[i] for i in nearest[:k]]
>>> topK_y
[1, 1, 1, 1, 1, 0]
# 使用collections库计算数组中的所有元素的出现次数,可以理解成返回字典形式
>>> from collections import Counter
>>> votes = Counter(topK_y)
>>> votes
Counter({1: 5, 0: 1})
# most_common(1)返回[(元素值, 出现次数)], 这里只取出现次数最多的元素值
>>> res = votes.most_common(1)[0][0]
>>> res
1
简洁版:
KNN不需要训练过程
没有模型,实际上训练数据集本身就是模型
2. 使用scikit-learn中的KNN
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
# Toy training set: ten 2-D samples, the first five labelled 0 and the last five labelled 1.
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423436942, 4.696522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

X_train = np.asarray(raw_data_X)
y_train = np.asarray(raw_data_y)

# The single new sample whose class we want to predict.
x = np.asarray([8.093607318, 3.365731514])

# Create a 6-neighbour classifier and fit it on the training set;
# fit() returns the estimator itself, so the two steps can be chained.
KNN_classifer = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# predict() expects a 2-D array (one row per sample), so give the single
# sample a leading axis before passing it in.
res = KNN_classifer.predict(x[np.newaxis, :])
print(res)
自己封装一个实现KNN的类
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """Minimal k-nearest-neighbours classifier.

    Prediction computes the Euclidean distance from the query sample to every
    stored training sample, takes the ``k`` closest ones and returns the label
    that occurs most often among them.  There is no real training step:
    ``fit`` only stores the training data.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest samples.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Filled in by fit(); None means "not fitted yet".
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set and return ``self``.

        Args:
            X_train: array whose first axis indexes samples (``.shape`` is
                read, so a NumPy-like array is expected).
            y_train: array of labels, one per row of ``X_train``.

        Raises:
            AssertionError: if the two arrays differ in length or there are
                fewer than ``k`` training samples.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array, one sample per row, with the same number of
                columns as the training data.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: if called before ``fit`` or if the feature count
                does not match the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single sample ``x``."""
        # Euclidean distance from x to each stored training sample.
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
        # Indices of the training samples ordered from nearest to farthest.
        nearest = np.argsort(distances)

        # Majority vote among the labels of the k nearest samples.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        """Return a short description including the configured ``k``."""
        return "KNN(k=%d)" % self.k
3. 判断机器学习算法的性能
问题:
如果模型很差怎么办?直接把它投入真实环境会造成真实的损失
真实环境难以拿到真实的label
解决方案:
划分训练集和测试集---用测试数据评估算法的性能,在模型进入真实环境前改进模型
测试我们的算法
训练集和测试集的划分:
在划分之前要先将数据集打乱顺序。为了保证样本和对应的标签在打乱之后仍能一一对应, 这里打乱的是索引, 而不是直接对数据集本身进行shuffle
以鸢尾花数据集为例:
# 数据集的加载(需先执行 from sklearn import datasets 和 import numpy as np)
>>> iris = datasets.load_iris()
>>> y = iris.target
>>> X = iris.data
>>> X.shape
(150, 4)
>>> y.shape
(150,)
# 打乱数据集的索引
>>> shuffle_indexes = np.random.permutation(len(X))
# 设置训练集和测试集的划分比例
>>> test_ratio = 0.2
>>> test_size = int(len(X) * test_ratio)
>>> test_size
30
# 训练集和测试集的划分
>>> test_indexes = shuffle_indexes[:test_size]
>>> train_indexes = shuffle_indexes[test_size:]
>>> X_train = X[train_indexes]
>>> y_train = y[train_indexes]
>>> X_test = X[test_indexes]
>>> y_test = y[test_indexes]
>>> X_train.shape
(120, 4)
>>> y_train.shape
(120,)
>>> X_test.shape
(30, 4)
>>> y_test.shape
(30,)
>>>
简洁版:
用sklearn中的train_test_split进行训练集和测试集的划分
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
>>> X_train.shape
(120, 4)
>>> y_train.shape
(120,)
>>> X_test.shape
(30, 4)
>>> y_test.shape
(30,)
>>>