数据集可以在下方留言找我要
或者

一、认识knn算法
寻找最近的k个数据,推测新数据的分类。
二、算法原理
通用步骤
- 计算距离(常用欧几里得距离或者马氏距离)
- 升序排序
- 取前k个
- 加权平均(分类问题中即按距离加权投票,距离越近权重越高)
k的选取
- k太大:导致分类模糊
- k太小:受个例影响,波动较大
如何选取k
- 经验
- 均方根误差:对若干个候选k分别计算预测误差,选取误差最小的k
读取数据
import csv
# Load the prostate-cancer CSV into a list of dicts (one dict per row).
# The path is a raw string so the Windows backslashes are never parsed as
# escape sequences; newline='' is what the csv module expects for file objects.
with open(r'E:\资资\python练习\knn\Prostate_Cancer.csv', 'r', newline='') as file:
    # DictReader maps every row to a dict keyed by the CSV header names.
    reader = csv.DictReader(file)
    datas = list(reader)
print(datas)
算距离
# Number of nearest neighbours to consult.
K = 5


def knn(data):
    """Print the distance from ``data`` to every sample in ``train_set``.

    Intermediate tutorial step: only the distance table is built and
    printed; nothing is returned.
    """
    # Step 1: pair each training label with its distance to the query sample.
    res = []
    for train in train_set:
        res.append({
            "result": train['diagnosis_result'],
            "distance": distance(data, train),
        })
    print(res)


# Try it on the first held-out sample.
knn(test_set[0])
# Step 2: sort the candidates by ascending distance.
# NOTE(review): `res` is not defined at module level in this snippet;
# presumably these two lines are meant to go inside knn(), right after the
# distance list is built — confirm.
res = sorted(res,key=lambda item:item['distance'])
print(res)
knn
def knn(data):
    """Print the distance-weighted class scores for ``data`` and its own label.

    The K samples of ``train_set`` closest to ``data`` (per ``distance``)
    each add ``1 - d / total`` to the score of their class, where ``d`` is
    the sample's distance and ``total`` the sum of the K distances, so
    closer samples weigh more. The score dict and then
    ``data['diagnosis_result']`` are printed; nothing is returned.
    """
    # Steps 1 and 2: distance to every training sample, nearest first.
    res = sorted(
        (
            {"result": train['diagnosis_result'], "distance": distance(data, train)}
            for train in train_set
        ),
        key=lambda item: item['distance'],
    )
    # Step 3: keep the K nearest.
    nearest = res[:K]

    # Step 4: weighted vote.
    result = {'B': 0, 'M': 0}
    total_distance = sum(r['distance'] for r in nearest)
    # The 1 - d/total weighting breaks down in two cases: a total of 0 would
    # divide by zero, and a lone neighbour would always get weight 0. Fall
    # back to a plain (unweighted) vote there.
    use_weights = len(nearest) > 1 and total_distance > 0
    for r in nearest:
        weight = 1 - r['distance'] / total_distance if use_weights else 1
        result[r['result']] += weight

    # Predicted scores per class.
    print(result)
    # Label recorded for the sample itself.
    print(data['diagnosis_result'])
测试
# Evaluation: count the held-out samples whose predicted label equals the
# recorded one, then report the share as a percentage.
correct = sum(1 for test in test_set if knn(test) == test['diagnosis_result'])
if test_set:
    print("准确率:{:.2f}%".format(100*correct/len(test_set)))
else:
    # An empty test set would otherwise raise ZeroDivisionError.
    print("测试集为空,无法计算准确率")
三、完整代码
import random
import csv
# Load the data set into a list of dicts (one dict per CSV row).
# The path is a raw string so the Windows backslashes are never parsed as
# escape sequences; newline='' is what the csv module expects for file objects.
with open(r'E:\资资\python练习\knn\Prostate_Cancer.csv', 'r', newline='') as file:
    # DictReader maps every row to a dict keyed by the CSV header names.
    reader = csv.DictReader(file)
    datas = list(reader)
# Split the rows into a test set and a training set.
# Shuffle in place first so the split does not depend on the file order.
random.shuffle(datas)
# The first third (rounded down) is held out for testing; the rest trains.
n = len(datas) // 3
test_set, train_set = datas[:n], datas[n:]
#KNN
# Columns used as features when no explicit key list is given.
_FEATURE_KEYS = (
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "symmetry",
    "fractal_dimension",
)


def distance(d1, d2, keys=_FEATURE_KEYS):
    """Return the Euclidean distance between two records.

    Args:
        d1, d2: mappings that contain every name in ``keys``; each value is
            converted with ``float()`` before use.
        keys: names of the entries to compare. Defaults to the eight
            feature columns listed in ``_FEATURE_KEYS``.

    Returns:
        The square root of the summed squared differences over ``keys``.
    """
    squared = sum((float(d1[key]) - float(d2[key])) ** 2 for key in keys)
    return squared ** 0.5
# Default number of nearest neighbours to consult.
K = 5


def knn(data, *, k=None, train=None, metric=None):
    """Classify ``data`` as 'B' or 'M' by a distance-weighted vote.

    The ``k`` training samples closest to ``data`` each add ``1 - d / total``
    to the score of their class, where ``d`` is the sample's distance and
    ``total`` the sum of the selected distances, so closer samples weigh
    more. If only one neighbour is selected, or all selected distances are
    0, every neighbour counts as 1 instead.

    Args:
        data: the record to classify.
        k: number of neighbours; defaults to the module-level ``K``.
        train: training records, each with a 'diagnosis_result' entry of
            'B' or 'M'; defaults to the module-level ``train_set``.
        metric: callable ``metric(data, record)`` returning a distance;
            defaults to the module-level ``distance``.

    Returns:
        'B' if the 'B' score is strictly greater than the 'M' score,
        otherwise 'M' (ties, including the no-neighbour case, give 'M').
    """
    if k is None:
        k = K
    if train is None:
        train = train_set
    if metric is None:
        metric = distance

    # Steps 1 and 2: distance to every training sample, nearest first.
    ranked = sorted(
        (
            {"result": row['diagnosis_result'], "distance": metric(data, row)}
            for row in train
        ),
        key=lambda item: item['distance'],
    )
    # Step 3: keep the k nearest.
    nearest = ranked[:k]

    # Step 4: weighted vote.
    result = {'B': 0, 'M': 0}
    total_distance = sum(item['distance'] for item in nearest)
    # The 1 - d/total weighting breaks down in two cases: a total of 0 would
    # divide by zero, and a lone neighbour would always get weight 0. Fall
    # back to a plain (unweighted) vote there.
    use_weights = len(nearest) > 1 and total_distance > 0
    for item in nearest:
        weight = 1 - item['distance'] / total_distance if use_weights else 1
        result[item['result']] += weight

    return 'B' if result['B'] > result['M'] else 'M'
# Evaluation: count the held-out samples whose predicted label equals the
# recorded one, then report the share as a percentage.
correct = sum(1 for test in test_set if knn(test) == test['diagnosis_result'])
if test_set:
    print("准确率:{:.2f}%".format(100*correct/len(test_set)))
else:
    # An empty test set would otherwise raise ZeroDivisionError.
    print("测试集为空,无法计算准确率")