# coding:utf-8
import numpy as np
from math import sqrt
from collections import Counter
from metrics import accuracy_score
class kNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    ``fit`` only stores the training data; the distance computation happens
    at prediction time.
    """

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: Number of nearest neighbours that take part in the vote.
                Must be at least 1.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert 1 <= k, 'k must be valid'
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set ``X_train`` / ``y_train``.

        Args:
            X_train: Array-like of training samples, one sample per row.
            y_train: Array-like of labels, one per row of ``X_train``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If the two inputs have a different number of
                rows, or if there are fewer than ``k`` training samples.
        """
        # asarray is a no-op for ndarrays and lets plain lists be passed too.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        assert X_train.shape[0] == y_train.shape[0], 'the size of X_train must equal to the size of y_train'
        assert self.k <= X_train.shape[0], 'the size of X_train must be at least k'
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array-like of samples with the same number of
                columns as the training data.

        Returns:
            ``numpy.ndarray`` holding one predicted label per input row.

        Raises:
            AssertionError: If ``fit`` has not been called, or the number of
                features differs from the training data.
        """
        assert self._X_train is not None and self._y_train is not None, 'must fit before predict'
        X_predict = np.asarray(X_predict)
        assert X_predict.shape[1] == self._X_train.shape[1], 'the feature number of X_predict equal to the feature number of self._X_train'
        return np.array([self._predict(x) for x in X_predict])

    def _predict(self, x):
        """Predict the label of the single sample ``x``.

        Args:
            x: 1-D array-like with one value per training feature.

        Returns:
            The most common label among the ``k`` training samples closest
            to ``x``.

        Raises:
            AssertionError: If the length of ``x`` differs from the number
                of training features.
        """
        x = np.asarray(x)
        assert x.shape[0] == self._X_train.shape[1], 'the feature number of x must be equal to X_train'
        # Broadcasting computes the distance to every training row at once.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        # A stable sort makes ties deterministic: among equidistant samples
        # the one with the lower training index comes first.
        nearest = np.argsort(distances, kind='stable')
        votes = Counter(self._y_train[i] for i in nearest[:self.k])
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return ``accuracy_score`` of the predictions for ``X_test``.

        Args:
            X_test: Samples to classify, in the format ``predict`` accepts.
            y_test: True labels for ``X_test``.

        Returns:
            Whatever ``accuracy_score(y_test, y_predict)`` returns.
        """
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        """Return a short description such as ``kNN(k=3)``."""
        return 'kNN(k=%d)' % self.k
# Usage:
# kNN_clf = kNNClassifier(3)
# kNN_clf.fit(X_train,y_train)
# kNN_clf.predict(x)
# kNN_clf.score(X_test,y_test)
import numpy as np
def accuracy_score(y_test, y_predict):
    """Return the fraction of entries in ``y_predict`` that equal ``y_test``.

    Args:
        y_test: Array-like of true labels.
        y_predict: Array-like of predicted labels with the same length.

    Returns:
        Number of matching entries divided by ``len(y_test)``.

    Raises:
        AssertionError: If the two inputs differ in length.
        ZeroDivisionError: If the inputs are empty, since accuracy is
            undefined without samples.
    """
    # asarray is a no-op for ndarrays and lets plain lists be passed too.
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    assert y_test.shape[0] == y_predict.shape[0],'the size of y_test must equal to the size of y_predict'
    n_samples = len(y_test)
    if n_samples == 0:
        raise ZeroDivisionError('accuracy is undefined for an empty test set')
    # Summing along axis 0 gives the same result as the builtin sum() over
    # the comparison array, but runs in native code.
    return np.sum(y_predict == y_test, axis=0) / n_samples
# Data normalization (mean-variance standardization):
import numpy as np
class StandardScaler:
    """Standardise features by subtracting the mean and dividing by the std.

    Attributes are ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        """Create an unfitted scaler."""
        # Per-feature mean of the data passed to fit().
        self.mean_ = None
        # Per-feature standard deviation of the data passed to fit().
        self.scale_ = None

    def fit(self, X):
        """Compute the per-feature mean and standard deviation of ``X``.

        Args:
            X: 2-D array-like, one sample per row and one feature per column.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If ``X`` is not 2-dimensional.
        """
        X = np.asarray(X)
        assert X.ndim == 2,'the dimension of X must be 2'
        # Reducing along axis 0 yields one value per column.
        self.mean_ = np.mean(X, axis=0)
        self.scale_ = np.std(X, axis=0)
        return self

    def transform(self, X):
        """Standardise ``X`` with the statistics stored by ``fit``.

        Args:
            X: 2-D array-like with the same number of columns that ``fit``
                saw.

        Returns:
            Float ``numpy.ndarray`` of the same shape as ``X`` holding
            ``(X - mean_) / scale_`` column by column. Columns whose stored
            standard deviation is 0 are only centred, not scaled.

        Raises:
            AssertionError: If ``X`` is not 2-dimensional, the scaler has
                not been fitted, or the feature count does not match.
        """
        X = np.asarray(X)
        assert X.ndim == 2,'the dimension of X must be 2'
        assert self.mean_ is not None and self.scale_ is not None , 'must fit before transform!'
        assert X.shape[1] == len(self.mean_),'the feature number of X must be equal to mean_ and std_'
        # A constant feature has std 0; dividing by it would give NaN/inf,
        # so such columns are divided by 1 instead.
        scale = np.asarray(self.scale_)
        safe_scale = np.where(scale == 0, 1.0, scale)
        return (X.astype(float) - self.mean_) / safe_scale

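# This article shows how to implement the k-nearest neighbors (kNN) algorithm in Python, including the mean-variance normalization step of data preprocessing.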
304

被折叠的 条评论
为什么被折叠?



