KNN Algorithm
Code Examples
Movie Genre Classification
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
# Classification task: predict a movie's genre
# Action movies have many fight scenes: Mission: Impossible 6, Kill Bill
# Romance movies have many kissing scenes: Titanic
# Features: 武打镜头 (fight scenes), 接吻镜头 (kissing scenes)
# Quantization: describe each movie with numeric feature values
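# For illustration only (hypothetical numbers; the real values live in movies.xlsx),
# a quantized row per movie might look like:
#   电影          武打镜头   接吻镜头   分类情况
#   杀死比尔      101        5          动作片
#   泰坦尼克号    4          22         爱情片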
movie=pd.read_excel('./movies.xlsx',sheet_name=1)
data=movie.iloc[:,1:3]       # feature columns (武打镜头, 接吻镜头)
target=movie['分类情况']     # label column
# Build the model
knn=KNeighborsClassifier(n_neighbors=5)
# fit(): the model learns the relationship between the features and the labels
knn.fit(data,target)
# Predict: apply the trained model to new samples
X_test=pd.DataFrame({'武打镜头':[100,67,1],'接吻镜头':[3,2,15]})
Y_test=knn.predict(X_test)
print(Y_test)
# Result
['动作片' '动作片' '爱情片']
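To make the prediction step above less of a black box, the short sketch below reproduces a 5-nearest-neighbor vote by hand using Euclidean distances. The training arrays are hypothetical stand-ins for the contents of movies.xlsx, which is not shown here.
import numpy as np
# Hypothetical training data: [fight scenes, kissing scenes] per movie
train_x=np.array([[104,3],[95,2],[87,5],[90,1],[2,18],[4,20],[1,15],[3,17]])
train_y=np.array(['动作片','动作片','动作片','动作片','爱情片','爱情片','爱情片','爱情片'])
def knn_predict(sample,k=5):
    # Euclidean distance from the query sample to every training sample
    dist=np.sqrt(((train_x-sample)**2).sum(axis=1))
    # labels of the k nearest neighbors
    nearest=train_y[np.argsort(dist)[:k]]
    # majority vote among those k labels
    labels,counts=np.unique(nearest,return_counts=True)
    return labels[counts.argmax()]
print(knn_predict(np.array([100,3])))   # expected: 动作片
print(knn_predict(np.array([1,15])))    # expected: 爱情片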
Iris Dataset Classification
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris=load_iris()
x=iris['data']
y=iris['target']
# 150 samples, 4 features: sepal length, sepal width, petal length, petal width
print(x.shape)
# Split the data in two: one part for training, one part for testing
#x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
index=np.arange(150)
np.random.shuffle(index)
x_train,x_test=x[index[:100]],x[index[100:]]
y_train,y_test=y[index[:100]],y[index[100:]]
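# Note: the four lines above do by hand what the commented-out train_test_split call does;
# shuffling an index array and slicing it keeps x and y aligned (100 training / 50 test samples).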
# n_neighbors: how many nearest neighbors to consider
# weights: how neighbors are weighted; 'uniform' gives every neighbor the same weight, 'distance' gives closer neighbors more weight
# p=1 uses the Manhattan distance, p=2 uses the Euclidean distance (the default)
knn=KNeighborsClassifier(n_neighbors=5,weights='distance',p=2)
knn.fit(x_train,y_train)
y_=knn.predict(x_test)
print('Predicted:',y_)
print('Ground truth:',y_test)
# By convention, results produced by the model are stored in variables ending with an underscore (y_, proba_)
proba_=knn.predict_proba(x_test)
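# With weights='distance', each row of proba_ is the distance-weighted share of the 5 neighbors'
# votes per class; with weights='uniform' every value would be a multiple of 1/5 = 0.2.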
print(proba_)
# Index of the largest probability in each row, i.e. the predicted class
print(proba_.argmax(axis=1))
# Accuracy on the test set
accuracy=knn.score(x_test,y_test)
print("Accuracy:",accuracy)
# Results
(150, 4)
Predicted: [0 2 1 2 0 2 0 0 1 0 0 1 2 2 1 2 1 0 1 1 0 0 2 0 1 2 1 0 1 2 0 0 0 2 1 2 2
1 0 1 0 0 2 2 1 2 2 2 1 2]
Ground truth: [0 2 1 1 0 2 0 0 1 0 0 1 2 2 1 2 1 0 1 1 0 0 2 0 1 2 1 0 1 2 0 0 0 2 1 2 2
1 0 1 0 0 2 1 1 2 2 2 1 2]
[[1. 0. 0. ]
[0. 0.32949775 0.67050225]
[0. 1. 0. ]
[0. 0.179748 0.820252 ]
[1. 0. 0. ]
[0. 0. 1. ]
[1. 0. 0. ]
[1. 0. 0. ]
[0. 0.88650652 0.11349348]
[1. 0. 0. ]
[1. 0. 0. ]
[0. 1. 0. ]
[0. 0.14932596 0.85067404]
[0. 0. 1. ]
[0. 1. 0. ]
[0. 0. 1. ]
[0. 1. 0. ]
[1. 0. 0. ]
[0. 1. 0. ]
[0. 1. 0. ]
[1. 0. 0. ]
[1. 0. 0. ]
[0. 0. 1. ]
[1. 0. 0. ]
[0. 1. 0. ]
[0. 0. 1. ]
[0. 0.83371931 0.16628069]
[1. 0. 0. ]
[0. 1. 0. ]
[0. 0. 1. ]
[1. 0. 0. ]
[1. 0. 0. ]
[1. 0. 0. ]
[0. 0. 1. ]
[0. 1. 0. ]
[0. 0. 1. ]
[0. 0. 1. ]
[0. 1. 0. ]
[1. 0. 0. ]
[0. 1. 0. ]
[1. 0. 0. ]
[1. 0. 0. ]
[0. 0. 1. ]
[0. 0. 1. ]
[0. 1. 0. ]
[0. 0. 1. ]
[0. 0. 1. ]
[0. 0.11992379 0.88007621]
[0. 1. 0. ]
[0. 0. 1. ]]
[0 2 1 2 0 2 0 0 1 0 0 1 2 2 1 2 1 0 1 1 0 0 2 0 1 2 1 0 1 2 0 0 0 2 1 2 2
1 0 1 0 0 2 2 1 2 2 2 1 2]
Accuracy: 0.96
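The n_neighbors, weights and p values above were fixed by hand. A common way to choose them is a cross-validated grid search; the sketch below (using the same load_iris data, not part of the original example) shows one way to do that.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
x,y=load_iris(return_X_y=True)
param_grid={'n_neighbors':[3,5,7,9,11],'weights':['uniform','distance'],'p':[1,2]}
# 5-fold cross-validated search over the KNN hyperparameters
grid=GridSearchCV(KNeighborsClassifier(),param_grid,cv=5)
grid.fit(x,y)
print(grid.best_params_,grid.best_score_)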
Handwritten Digit Classification
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
# Read one bitmap (.bmp) image
digit=cv2.imread('./data/0/0_101.bmp')
# Convert the color image (3 channels) to grayscale: (28,28,3)-------->(28,28)
digit=cv2.cvtColor(digit,code=cv2.COLOR_BGR2GRAY)
# Load all images; one channel is kept as the grayscale image
x=[]
for i in range(10):
    for j in range(1,501):   # 500 images per digit (file numbering assumed to run 1..500)
        digit=cv2.imread('./data/%d/%d_%d.bmp'%(i,i,j))
        x.append(digit[:,:,0])
# The samples in x and the labels in y correspond one-to-one
x=np.asarray(x) # convert the list of images to a NumPy array
y=np.array([i for i in range(10)]*500)
y.sort()
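# Equivalent, more concise construction of the labels: y = np.repeat(np.arange(10), 500)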
# digit is 2D (height x width); each pixel holds a single intensity value, so pick a colormap for display
index=np.random.randint(0,5000,size=1)[0]
digit=x[index]
print('---------------',y[index])
plt.imshow(digit,cmap=plt.cm.gray)
# Binarization: map every pixel to either 0 or 255
for i in range(5000):
    for a in range(28):
        for b in range(28):
            if x[i][a,b]<200:
                x[i][a,b]=0
            else:
                x[i][a,b]=255
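# Note (not part of the original script): the three nested loops above are equivalent to a
# single vectorized call, which is much faster on 5000 images:
# x = np.where(x < 200, 0, 255).astype(x.dtype)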
# Split x and y into training and validation data
# train_test_split shuffles the samples and splits them by the given ratio
from sklearn.model_selection import train_test_split
# Train and evaluate; average the accuracy over 30 random splits
# test_size=0.2 (train:test = 4:1); the split keeps x and y aligned
accuracy=0
for i in range(30):
    # re-split on every iteration so the average is taken over different train/test partitions
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
    # KNN expects 2D input, so reshape (n_samples,28,28)------->(n_samples,784)
    x_train=x_train.reshape(4000,-1)
    x_test=x_test.reshape(1000,-1)
    knn=KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train,y_train)
    # predict on the 1000 held-out samples the model has never seen
    y_=knn.predict(x_test)
    accuracy+=(y_==y_test).mean()/30
print('Average accuracy over repeated splits: %0.3f'%(accuracy))
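As a usage sketch (the file path below is just a hypothetical example), a single image can be classified by repeating the same preprocessing before calling predict on the last trained model:
new_digit=cv2.imread('./data/3/3_250.bmp')[:,:,0]   # keep one channel as grayscale
new_digit=np.where(new_digit<200,0,255)             # same binarization as the training data
print(knn.predict(new_digit.reshape(1,-1)))         # reshape to (1, 784): one sample, 784 features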