2D数据类别划分
本篇笔记主要完成的任务如下:
1.采用KMeans算法实现2D数据自动聚类,并预测数据点(V1=80,V2=60)所属的类别
2.计算预测准确率,完成结果矫正
3.利用KNN、Meanshift算法重复步骤1、2
数据集链接如下:https://pan.baidu.com/s/1WQCHNwXWkvWeZYa9R-7rww
提取码:1234
#和之前一样先导入我们需要的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
1.导入数据并进行数据可视化
# Load the sample set from the CSV file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)
V1 | V2 | labels | |
---|---|---|---|
0 | 2.072345 | -3.241693 | 0 |
1 | 17.936710 | 15.784810 | 0 |
2 | 1.083576 | 7.319176 | 0 |
3 | 11.120670 | 14.406780 | 0 |
4 | 23.711550 | 2.557729 | 0 |
这里我们进行的是无监督学习,训练时只需要前两列(V1、V2);最后一列 labels 一方面用来评估聚类结果,另一方面作为KNN(属于监督学习的分类算法,而不是聚类算法)的训练标签,以加深对监督学习和无监督学习的认识。
# Separate the ground-truth label column (y) from the feature columns (x).
y = data['labels']
x = data.drop(columns=['labels'])
y.head()
0 0
1 0
2 0
3 0
4 0
Name: labels, dtype: int64
# Count how many samples carry each ground-truth label.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
y.value_counts()
2 1156
1 954
0 890
Name: labels, dtype: int64
# Scatter plot of the raw samples, without any class information.
# The figure is created once, with its size: calling plt.figure() twice left
# an empty default-sized figure behind and bound fig1 to that empty figure.
fig1 = plt.figure(figsize=(6, 6))
plt.scatter(x.loc[:, 'V1'], x.loc[:, 'V2'])
plt.title('un-labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.show()
<Figure size 432x288 with 0 Axes>
![output_8_1](output_8_1.png)
# Scatter plot of the samples coloured by their ground-truth label.
# The figure is created once, with its size, so no empty extra figure appears.
fig2 = plt.figure(figsize=(6, 6))
# One scatter call per label value, so each class gets its own colour/handle.
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
# This plot uses the labels, so it is titled accordingly
# (the title was previously copied from the un-labeled plot).
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
<Figure size 432x288 with 0 Axes>
2.利用sklearn包构建模型
# Build a 3-cluster KMeans model with a fixed random seed and fit it on the
# feature columns only (the labels are not used for training).
from sklearn.cluster import KMeans
KM = KMeans(random_state=0, n_clusters=3)
KM.fit(X=x)
KMeans(n_clusters=3, random_state=0)
# Centroid coordinates of the fitted KMeans model, one row per cluster.
# The later plots index column 0 as V1 and column 1 as V2.
centers = KM.cluster_centers_
centers
array([[ 69.92418447, -10.11964119],
[ 40.68362784, 59.71589274],
[ 9.4780459 , 10.686052 ]])
# Ground-truth classes with the KMeans centroids drawn on top.
# The figure is created once, with its size, so no empty extra figure appears.
fig3 = plt.figure(figsize=(6, 6))
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
# Fourth scatter: the fitted KMeans centroids.
plt.scatter(centers[:, 0], centers[:, 1])
# The points are coloured by label, so the title says "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
<Figure size 432x288 with 0 Axes>
# Test point: V1 = 80, V2 = 60.
# The point is wrapped in a DataFrame that reuses the training column names;
# passing a bare list triggers scikit-learn's "X does not have valid feature
# names" UserWarning because the model was fitted on a DataFrame.
test_point = pd.DataFrame([[80, 60]], columns=x.columns)
y_predict_test = KM.predict(test_point)
print(y_predict_test)
[1]
D:\ProgramData\Anaconda3\envs\imooc_ai\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names
warnings.warn(
# Assign every training sample to a KMeans cluster.
y_predict = KM.predict(x)
# Print cluster sizes next to the true class sizes. Series.value_counts() is
# used because the top-level pd.value_counts() helper is deprecated.
print(pd.Series(y_predict).value_counts(), y.value_counts())
1 1149
0 952
2 899
dtype: int64 2 1156
1 954
0 890
Name: labels, dtype: int64
# Score the raw cluster ids directly against the ground-truth labels.
# No relabelling has been applied yet at this point.
from sklearn.metrics import accuracy_score
accuray = accuracy_score(y_true=y, y_pred=y_predict)
print(accuray)
0.0023333333333333335
# Side-by-side view of the training set:
# left = coloured by raw KMeans cluster id, right = coloured by true label.
# NOTE: plt.subplot() returns an Axes object, despite the fig4/fig5 names.
fig4 = plt.subplot(121)
label0, label1, label2 = [
    plt.scatter(x.loc[y_predict == k, 'V1'], x.loc[y_predict == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])  # KMeans centroids
plt.title('predict data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))

fig5 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])  # KMeans centroids
# This panel is coloured by the true labels, so it is titled "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
3.进行数据矫正
KMeans输出的簇编号是任意的,与数据集中给出的标签编号并不一一对应,所以上面直接计算出的准确率接近0。这里的矫正策略是:对照上面两幅图找出每个簇编号对应的真实标签,把预测得到的簇编号替换为对应的真实标签,并保存到一个新的列表中。
# Relabel the KMeans cluster ids so that they use the ground-truth label ids.
# NOTE: the mapping below is hard-coded for this particular fit; cluster ids
# are arbitrary, so it must be re-checked if the model or data changes.
# Cluster 0 -> label 1, cluster 1 -> label 2, anything else -> label 0.
cluster_to_label = {0: 1, 1: 2}
y_corrected = [cluster_to_label.get(i, 0) for i in y_predict]
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(y_corrected).value_counts(), y.value_counts())
2 1149
1 952
0 899
dtype: int64 2 1156
1 954
0 890
Name: labels, dtype: int64
# Accuracy of the relabelled KMeans assignments against the true labels.
accuray1 = accuracy_score(y_true=y, y_pred=y_corrected)
print(accuray1)
0.997
# Convert to an ndarray so that `y_corrected == k` yields an element-wise
# boolean mask (a plain list would compare as a whole).
y_corrected = np.array(y_corrected)

# Left = relabelled KMeans result, right = ground-truth labels.
# NOTE: plt.subplot() returns an Axes object, despite the fig6/fig7 names.
fig6 = plt.subplot(121)
label0, label1, label2 = [
    plt.scatter(x.loc[y_corrected == k, 'V1'], x.loc[y_corrected == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])  # KMeans centroids
plt.title('correct data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))

fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])  # KMeans centroids
# This panel is coloured by the true labels, so it is titled "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
# Build a 3-nearest-neighbour classifier and fit it on the features together
# with the ground-truth labels (supervised, unlike the clustering models).
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X=x, y=y)
KNeighborsClassifier(n_neighbors=3)
# Predict the class of the test point V1=80, V2=60.
# The point is wrapped in a DataFrame that reuses the training column names;
# passing a bare list triggers scikit-learn's "X does not have valid feature
# names" UserWarning because the model was fitted on a DataFrame.
test_point = pd.DataFrame([[80, 60]], columns=x.columns)
y_predict_knn_test = KNN.predict(test_point)
# Predictions on the same samples the model was fitted on, so the accuracy
# printed below is a training-set score.
y_predict_knn = KNN.predict(x)
print(y_predict_knn_test)
print("knn accuracy:", accuracy_score(y, y_predict_knn))
[2]
knn accuracy: 1.0
D:\ProgramData\Anaconda3\envs\imooc_ai\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
warnings.warn(
# Compare predicted class sizes with the true class sizes.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(y_predict_knn).value_counts(), y.value_counts())
2 1156
1 954
0 890
dtype: int64 2 1156
1 954
0 890
Name: labels, dtype: int64
# Left = KNN predictions on the training set, right = ground-truth labels.
# NOTE: plt.subplot() returns an Axes object, despite the fig6/fig7 names.
fig6 = plt.subplot(121)
label0, label1, label2 = [
    plt.scatter(x.loc[y_predict_knn == k, 'V1'], x.loc[y_predict_knn == k, 'V2'])
    for k in (0, 1, 2)
]
# The centroids drawn here are the ones from the earlier KMeans fit.
plt.scatter(centers[:, 0], centers[:, 1])
plt.title('knn result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))

fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])
# This panel is coloured by the true labels, so it is titled "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
# MeanShift setup: estimate the kernel bandwidth from the data,
# using 500 samples for the estimate.
from sklearn.cluster import MeanShift, estimate_bandwidth
bw = estimate_bandwidth(X=x, n_samples=500)
print(bw)
30.84663454820215
# Fit MeanShift on the feature columns with the estimated bandwidth.
# The number of clusters is not passed in.
ms = MeanShift(bandwidth=bw)
ms.fit(X=x)
MeanShift(bandwidth=30.84663454820215)
# Assign every training sample to a MeanShift cluster.
y_predict_ms = ms.predict(x)
# Print cluster sizes next to the true class sizes. Series.value_counts() is
# used because the top-level pd.value_counts() helper is deprecated.
print(pd.Series(y_predict_ms).value_counts(), y.value_counts())
0 1149
1 952
2 899
dtype: int64 2 1156
1 954
0 890
Name: labels, dtype: int64
# Left = raw MeanShift cluster ids, right = ground-truth labels.
# NOTE: plt.subplot() returns an Axes object, despite the fig6/fig7 names.
fig6 = plt.subplot(121)
label0, label1, label2 = [
    plt.scatter(x.loc[y_predict_ms == k, 'V1'], x.loc[y_predict_ms == k, 'V2'])
    for k in (0, 1, 2)
]
# The centroids drawn here are the ones from the earlier KMeans fit.
plt.scatter(centers[:, 0], centers[:, 1])
plt.title('meanshift result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))

fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])
# This panel is coloured by the true labels, so it is titled "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()
# Relabel the MeanShift cluster ids so that they use the ground-truth label ids.
# NOTE: the mapping below is hard-coded for this particular fit; cluster ids
# are arbitrary, so it must be re-checked if the model or data changes.
# Cluster 0 -> label 2, cluster 1 -> label 1, anything else -> label 0.
cluster_to_label_ms = {0: 2, 1: 1}
y_corrected_ms = [cluster_to_label_ms.get(i, 0) for i in y_predict_ms]
# Report the counts of the MeanShift relabelling itself; the earlier code
# printed the KMeans variable y_corrected here by mistake.
print(pd.Series(y_corrected_ms).value_counts(), y.value_counts())
2 1149
1 952
0 899
dtype: int64 2 1156
1 954
0 890
Name: labels, dtype: int64
# Convert to an ndarray so that `y_corrected_ms == k` yields an element-wise
# boolean mask (a plain list would compare as a whole).
y_corrected_ms = np.array(y_corrected_ms)
print(type(y_corrected_ms))

# Left = relabelled MeanShift result, right = ground-truth labels.
# NOTE: plt.subplot() returns an Axes object, despite the fig6/fig7 names.
fig6 = plt.subplot(121)
label0, label1, label2 = [
    plt.scatter(x.loc[y_corrected_ms == k, 'V1'], x.loc[y_corrected_ms == k, 'V2'])
    for k in (0, 1, 2)
]
# The centroids drawn here are the ones from the earlier KMeans fit.
plt.scatter(centers[:, 0], centers[:, 1])
plt.title('meanshift result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))

fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y == k, 'V1'], x.loc[y == k, 'V2'])
    for k in (0, 1, 2)
]
plt.scatter(centers[:, 0], centers[:, 1])
# This panel is coloured by the true labels, so it is titled "labeled".
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0, label1, label2), ('label0', 'label1', 'label2'))
plt.show()