注:本文是夏文俊对KNN算法的R语言实现的介绍
KNN(K-Nearest Neighbor或者K-最近邻)算法是回归和分类问题当中一个非常基本和重要的算法。简要介绍参见贝叶斯分类器及其若干实现算法的原理。
文中使用的数据来自UCI机器学习网站(https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)),为乳腺癌的诊断数据:响应变量为是否患乳腺癌,自变量为乳腺肿块的数字特征。原始数据第一列为编号,将其删除。由于各个变量之间尺度差别很大,因此对各个变量进行归一化操作。
# Load the 'class' package, which provides the knn() classifier.
library(class)

# Read the Wisconsin breast cancer data (comma-separated, no header).
mydata <- read.table("C:/wdbc.data", sep = ",", header = FALSE)

# First look at the raw data.
head(mydata)

# The first column is a patient ID with no predictive value: drop it.
mydata <- mydata[, -1]

# Structure and summary statistics of the remaining 31 columns
# (column 1 = diagnosis label, columns 2-31 = numeric features).
str(mydata)
summary(mydata)
# Min-max normalization: linearly rescale a numeric vector onto [0, 1].
#
# @param x A numeric vector with at least two distinct values.
# @return A numeric vector the same length as x, with min 0 and max 1.
# NOTE(review): a constant vector (max(x) == min(x)) divides by zero and
# returns NaN; callers should ensure each column has some spread.
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
# Normalize every feature column. lapply() returns a list of columns
# which data.frame() reassembles; this avoids sapply(), whose return
# type silently depends on its input.
X <- data.frame(lapply(mydata[, 2:31], normalize))

# Column 1 is the diagnosis label (the response variable).
Y <- mydata[, 1]

# Class counts -- the two classes are reasonably balanced.
table(Y)

# Reassemble: response first, normalized predictors after.
newmydata <- cbind(Y, X)
我们将采用交叉验证法来确定K的取值:将数据集划分为训练集、验证集、测试集,选取在验证集上误差最小的K值作为最优的K值。最终,当K=6时验证集误差最小,模型在测试集上的准确率为0.982。
# Split the data into training (60%), test (20%) and validation (20%)
# sets by drawing a group label 1/2/3 for every row.
set.seed(0)
index <- sample(x = 1:3, size = nrow(newmydata), replace = TRUE,
                prob = c(0.6, 0.2, 0.2))
train <- newmydata[index == 1, ]
test <- newmydata[index == 2, ]
valid <- newmydata[index == 3, ]
# ?knn  # interactive help call removed from the script; see help(knn)
# Tune k on the validation set: try k = 1 .. round(sqrt(n_train))
# (a common heuristic upper bound) and print each error rate.
for (i in seq_len(round(sqrt(nrow(train))))) {
  model <- knn(train = train[, -1], test = valid[, -1], cl = train$Y, k = i)
  # Confusion matrix: rows = true labels, columns = predictions.
  Freq <- table(valid[, 1], model)
  cat('k=', i, '\t', 'error:', 1 - sum(diag(Freq)) / sum(Freq), '\n')
}
# k = 6 gave the lowest validation error (0.01785714), so refit with
# k = 6 and evaluate once on the held-out test set.
fit <- knn(train = train[, -1], test = test[, -1], cl = train$Y, k = 6)

# Confusion matrix of true test labels vs. predicted labels.
Freq <- table(test[, 1], fit)

# Test-set accuracy: share of correct predictions (the diagonal).
sum(diag(Freq)) / sum(Freq)
接着,我们通过模拟实验来验证当维数增大时KNN分类器可能面临的维数灾难。我们假设数据来自多元正态分布,并根据sigmoid函数的输出值确定类别。
### KNN 的维度灾难
# 维度增大以后 数据变得稀疏,且集中在超球体的边缘,导致距离的判断变难
# MASS provides mvrnorm() for sampling from a multivariate normal.
library(MASS)

# Choose the best k for knn by validation error.
#
# @param train Data frame whose first column is the class label and
#   remaining columns are the predictors.
# @param valid Validation set with the same layout as train.
# @return The k in 1 .. round(sqrt(nrow(train))) with the lowest
#   validation error (ties favor the smaller k).
determinK <- function(train, valid) {
  min_error <- 1
  best_k <- 1
  for (i in seq_len(round(sqrt(nrow(train))))) {
    model <- knn(train = train[, -1], test = valid[, -1],
                 cl = train[, 1], k = i)
    # Confusion matrix: rows = true labels, columns = predictions.
    Freq <- table(valid[, 1], model)
    curr_error <- 1 - sum(diag(Freq)) / sum(Freq)
    # Keep the smallest error seen so far.
    if (curr_error < min_error) {
      best_k <- i
      min_error <- curr_error
    }
  }
  best_k
}
# Simulate the curse of dimensionality: for p = 10, 20, ..., 500,
# draw n = 1000 points from N(0, I_p), assign labels via a sigmoid of
# a random linear score, and record knn test accuracy.
dimension <- (1:50) * 10
# Preallocate the result vector (one accuracy per dimension).
result <- numeric(length(dimension))
for (i in seq_along(dimension)) {
  n <- 1000
  p <- dimension[i]
  mu <- rep(0, p)
  Sigma <- diag(p)
  set.seed(i)
  X <- mvrnorm(n, mu, Sigma)
  # Random linear score squashed through the sigmoid, then
  # thresholded at 0.5 to produce a binary class label.
  Y <- X %*% rnorm(p)
  Y <- exp(Y) / (1 + exp(Y))
  Y[Y >= 0.5] <- 1
  Y[Y < 0.5] <- 0
  data <- data.frame(cbind(Y, X))
  data[, 1] <- as.factor(data[, 1])
  # 60/20/20 split into train / test / validation.
  index <- sample(x = 1:3, size = nrow(X), replace = TRUE,
                  prob = c(0.6, 0.2, 0.2))
  train <- data[index == 1, ]
  test <- data[index == 2, ]
  valid <- data[index == 3, ]
  # Tune k on the validation set, then score on the test set.
  K <- determinK(train, valid)
  fit <- knn(train = train[, -1], test = test[, -1],
             cl = train[, 1], k = K)
  Freq <- table(test[, 1], fit)
  result[i] <- sum(diag(Freq)) / sum(Freq)
  cat('p=', p, 'accuracy', sum(diag(Freq)) / sum(Freq), '\n')
}
# Accuracy as a function of dimension; the red reference line at 0.5
# marks chance-level performance for a balanced binary problem.
plot(dimension, result, type = 'o', ylim = c(0.5, 0.9), ylab = 'accuracy')
abline(h = 0.5, col = "red")
