KNN (the k-nearest neighbors algorithm) is a simple classification algorithm in machine learning.
Basic idea:
To classify a sample A in the sample space, select the K samples closest to A; if the majority of those K samples belong to some class B, then A is assigned to class B.
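As a compact sketch of that idea (the toy 1-D samples and query value here are invented for illustration, not taken from any dataset), the whole vote fits in a few lines:

from collections import Counter

samples = [(1.0, 'B'), (1.2, 'B'), (3.5, 'C'), (0.8, 'B'), (4.0, 'C')]  # (feature, label)
query, K = 1.1, 3
# sort by distance to the query, keep the K nearest, take the majority label
nearest = sorted(samples, key=lambda s: abs(s[0] - query))[:K]
print(Counter(label for _, label in nearest).most_common(1)[0][0])  # 'B'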
Measuring closeness
The Euclidean distance between feature vectors serves as the distance between two samples; it can be computed directly with the distance module from scipy (scipy.spatial.distance.euclidean).
from scipy.spatial import distance
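As a quick sanity check (the points are made up: a 3-4-5 right triangle), distance.euclidean behaves like the familiar 2-norm:

from scipy.spatial import distance

print(distance.euclidean([0, 0], [3, 4]))  # 5.0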
Testing
The Iris flower data set is used for testing; trying out a different classifier only requires changing the line that constructs it.
from sklearn import datasets
import KNN
iris = datasets.load_iris()
X = iris.data
y = iris.target
# split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# choose the classifier
my_classifier = KNN.KNN()
my_classifier.fit(X_train, y_train)  # train
predictions = my_classifier.predict(X_test)  # predict
# measure accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
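As noted above, testing a different classifier only means replacing the constructor line. For example, scikit-learn's built-in KNeighborsClassifier (shown here with n_neighbors=3 to match the K used in the implementation below) makes a convenient reference point for the accuracy score:

from sklearn.neighbors import KNeighborsClassifier

my_classifier = KNeighborsClassifier(n_neighbors=3)  # drop-in replacement for KNN.KNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)
print(accuracy_score(y_test, predictions))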
Implementation
Training
The training data is simply stored as-is; no other processing is done.
Prediction
- Compute the distance between a test sample A and every training sample
- Select the K nearest samples
- Find the class held by the majority of those K samples
from scipy.spatial import distance

class KNN:
    def fit(self, X_train, y_train):
        # training just memorizes the data; no other processing
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        K = 3  # number of neighbors that vote (fixed at 3 here)
        predictions = []
        for row in X_test:
            label = self.Knearest(row, K)
            predictions.append(label)
        return predictions

    # K = 1: return the label of the single closest training sample
    def closest(self, row):
        best_dist = distance.euclidean(row, self.X_train[0])
        best_index = 0
        for i in range(1, len(self.X_train)):
            dist = distance.euclidean(row, self.X_train[i])
            if dist < best_dist:
                best_dist = dist
                best_index = i
        return self.y_train[best_index]

    # K > 1: majority vote among the K nearest training samples
    def Knearest(self, row, K):
        dist = {}
        for i in range(len(self.X_train)):  # start at 0 so no sample is skipped
            dist[i] = distance.euclidean(row, self.X_train[i])
        # sort training indices by ascending distance
        lst = sorted(dist.items(), key=lambda d: d[1])
        # count the labels of the K nearest samples
        classCount = {}
        for i in range(K):
            voteLabel = self.y_train[lst[i][0]]
            classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
        # pick the label with the most votes
        maxCount = 0
        for key, value in classCount.items():
            if value > maxCount:
                maxCount = value
                voteclass = key
        return voteclass
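A minimal smoke test (toy coordinates invented for illustration): two well-separated clusters, so with K=3 the vote should recover the obvious labels:

clf = KNN()
clf.fit([[0, 0], [0, 1], [1, 0], [10, 10], [10, 11], [11, 10]],
        [0, 0, 0, 1, 1, 1])
print(clf.predict([[0.5, 0.5], [10.5, 10.5]]))  # expected: [0, 1]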