2Ddata_cluster classifier

这篇笔记介绍了如何使用KMeans和Meanshift算法对2D数据进行无监督聚类,并使用KNN算法进行有监督分类,同时评估预测准确率。首先,通过数据可视化展示了原始数据及其标签分布,接着用KMeans聚类并计算预测准确率,再对聚类标签进行矫正,使其与原始标签对应。然后,应用KNN算法实现分类,在训练数据上得到100%的预测准确率。最后,使用Meanshift算法再次进行聚类,经过标签矫正后与原始标签对应。整个过程揭示了监督学习和无监督学习的差异。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

2D数据类别划分

本篇笔记主要完成的任务如下:

1.采用KMeans算法实现2D数据自动聚类,并预测数据点V1=80,V2=60所属的类别

2.计算预测准确率,完成结果矫正

3.利用KNN、Meanshift算法重复步骤1、2

数据集链接如下:链接:https://pan.baidu.com/s/1WQCHNwXWkvWeZYa9R-7rww
提取码:1234

#和之前一样先导入我们需要的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1.导入数据并进行数据可视化

# Load the data set from the CSV file that sits next to the notebook and
# preview its first rows.
csv_path = 'data.csv'
data = pd.read_csv(csv_path)
data.head()
          V1         V2  labels
0   2.072345  -3.241693       0
1  17.936710  15.784810       0
2   1.083576   7.319176       0
3  11.120670  14.406780       0
4  23.711550   2.557729       0

这里我们进行的是无监督学习,建模时只需要前两列(V1、V2);最后一列labels用于评估聚类结果,并作为KNN有监督分类的标签,以加深对监督和无监督学习的认识。

# Split the frame into the feature columns (everything except 'labels')
# and the label column, then preview the labels.
y = data['labels']
x = data.drop(columns=['labels'])
y.head()
0    0
1    0
2    0
3    0
4    0
Name: labels, dtype: int64
# Count how many samples carry each label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y.value_counts()
2    1156
1     954
0     890
Name: labels, dtype: int64
# Scatter the raw samples without any label information.
# Only one figure is created: calling plt.figure() twice left an extra,
# empty default-sized figure behind.
fig1 = plt.figure(figsize=(6,6))
plt.scatter(x.loc[:,'V1'],x.loc[:,'V2'])
plt.title('un-labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.show()
<Figure size 432x288 with 0 Axes>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RtF2Co7q-1649146411759)(output_8_1.png)]

# Scatter the samples coloured by their label from the 'labels' column.
# A single 6x6 figure is created (the extra plt.figure() call only produced
# an empty figure), and the title now says the data is labeled.
fig2 = plt.figure(figsize=(6,6))
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]

plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()
<Figure size 432x288 with 0 Axes>

在这里插入图片描述

2.利用sklearn包构建模型

# Build a 3-cluster KMeans model with a fixed random seed and fit it on the
# feature columns only (no labels are passed).
from sklearn.cluster import KMeans
KM = KMeans(random_state=0, n_clusters=3)
KM.fit(X=x)
KMeans(n_clusters=3, random_state=0)
# Cluster centres found by the fitted KMeans model (one row per cluster).
centers = KM.cluster_centers_
centers
array([[ 69.92418447, -10.11964119],
       [ 40.68362784,  59.71589274],
       [  9.4780459 ,  10.686052  ]])
# Scatter the samples coloured by their label and overlay the KMeans centres.
# A single 6x6 figure is created (the extra plt.figure() call only produced
# an empty figure), and the title now says the data is labeled.
fig3 = plt.figure(figsize=(6,6))
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]

plt.scatter(centers[:,0],centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()
<Figure size 432x288 with 0 Axes>

在这里插入图片描述

# Test sample: V1 = 80, V2 = 60
# The sample is wrapped in a DataFrame that reuses the training column names;
# passing a bare nested list makes sklearn warn
# "X does not have valid feature names".
x_test = pd.DataFrame([[80,60]],columns=x.columns)
y_predict_test = KM.predict(x_test)
print(y_predict_test)
[1]


D:\ProgramData\Anaconda3\envs\imooc_ai\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names
  warnings.warn(
# Predict the cluster of every training sample and compare the cluster sizes
# with the label counts.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y_predict = KM.predict(x)
print(pd.Series(y_predict).value_counts(),y.value_counts())
1    1149
0     952
2     899
dtype: int64 2    1156
1     954
0     890
Name: labels, dtype: int64
# Score the raw KMeans cluster ids against the labels.
from sklearn.metrics import accuracy_score
accuray = accuracy_score(y_true=y, y_pred=y_predict)
print(accuray)
0.0023333333333333335
# Visualise the training set: KMeans cluster assignments on the left, the
# labels from the data set on the right, both with the KMeans centres.
fig4 = plt.subplot(121)

label0, label1, label2 = [
    plt.scatter(x.loc[y_predict==k,'V1'],x.loc[y_predict==k,'V2'])
    for k in (0, 1, 2)
]

plt.scatter(centers[:,0],centers[:,1])
plt.title('predict data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))

# Right panel is coloured by the labels, so it is titled accordingly.
fig5 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]
plt.scatter(centers[:,0],centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

在这里插入图片描述

3.进行数据矫正

这里的矫正策略是:聚类给出的簇编号是任意的,与数据集中原始标签的编号并不一致,因此我们按照两者的对应关系把预测的簇编号映射为原始标签,并保存到一个新的列表中。

# Correct the KMeans result: remap each cluster id to the label id used in
# the data set (0 -> 1, 1 -> 2, anything else -> 0).
kmeans_to_label = {0: 1, 1: 2}
y_corrected = [kmeans_to_label.get(i, 0) for i in y_predict]
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(y_corrected).value_counts(),y.value_counts())
2    1149
1     952
0     899
dtype: int64 2    1156
1     954
0     890
Name: labels, dtype: int64
# Score the remapped KMeans ids against the labels.
accuray1 = accuracy_score(y_true=y, y_pred=y_corrected)
print(accuray1)
0.997
# Convert the corrected ids to an array so they can be compared element-wise
# and used as boolean masks below.
y_corrected = np.array(y_corrected)

# Left panel: corrected KMeans ids; right panel: labels from the data set.
fig6 = plt.subplot(121)

label0, label1, label2 = [
    plt.scatter(x.loc[y_corrected==k,'V1'],x.loc[y_corrected==k,'V2'])
    for k in (0, 1, 2)
]

plt.scatter(centers[:,0],centers[:,1])
plt.title('correct data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))

# Right panel is coloured by the labels, so it is titled accordingly.
fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]
plt.scatter(centers[:,0],centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

在这里插入图片描述

# Build a 3-nearest-neighbours classifier and fit it on the features
# together with the labels.
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X=x, y=y)
KNeighborsClassifier(n_neighbors=3)
# Predict the test sample V1=80, V2=60 and every training sample.
# The sample is wrapped in a DataFrame that reuses the training column names;
# passing a bare nested list makes sklearn warn
# "X does not have valid feature names".
x_test = pd.DataFrame([[80,60]],columns=x.columns)
y_predict_knn_test = KNN.predict(x_test)
y_predict_knn = KNN.predict(x)
print(y_predict_knn_test)
# The accuracy is measured on the same data the model was fitted on.
print("knn accuracy:",accuracy_score(y,y_predict_knn))
[2]
knn accuracy: 1.0


D:\ProgramData\Anaconda3\envs\imooc_ai\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
  warnings.warn(
# Compare the predicted class sizes with the label counts.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(y_predict_knn).value_counts(),y.value_counts())
2    1156
1     954
0     890
dtype: int64 2    1156
1     954
0     890
Name: labels, dtype: int64
# Left panel: KNN predictions; right panel: labels from the data set.
# Both panels overlay the KMeans centres computed earlier.
fig6 = plt.subplot(121)

label0, label1, label2 = [
    plt.scatter(x.loc[y_predict_knn==k,'V1'],x.loc[y_predict_knn==k,'V2'])
    for k in (0, 1, 2)
]

plt.scatter(centers[:,0],centers[:,1])
plt.title('knn result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))

# Right panel is coloured by the labels, so it is titled accordingly.
fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]
plt.scatter(centers[:,0],centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

在这里插入图片描述

# Set up for the MeanShift model.
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
# Estimate the kernel bandwidth from the features, using 500 samples.
bw = estimate_bandwidth(X=x, n_samples=500)
print(bw)
30.84663454820215
# Build MeanShift with the estimated bandwidth and fit it on the features
# only (no labels are passed).
ms = MeanShift(bandwidth=bw)
ms.fit(X=x)
MeanShift(bandwidth=30.84663454820215)
# Predict the MeanShift cluster of every training sample and compare the
# cluster sizes with the label counts.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y_predict_ms = ms.predict(x)
print(pd.Series(y_predict_ms).value_counts(),y.value_counts())
0    1149
1     952
2     899
dtype: int64 2    1156
1     954
0     890
Name: labels, dtype: int64
# Left panel: raw MeanShift cluster ids; right panel: labels from the data
# set. The overlay uses the MeanShift centres rather than the KMeans
# `centers`, so the markers belong to the model being shown.
ms_centers = ms.cluster_centers_
fig6 = plt.subplot(121)

label0, label1, label2 = [
    plt.scatter(x.loc[y_predict_ms==k,'V1'],x.loc[y_predict_ms==k,'V2'])
    for k in (0, 1, 2)
]

plt.scatter(ms_centers[:,0],ms_centers[:,1])
plt.title('meanshift result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))

# Right panel is coloured by the labels, so it is titled accordingly.
fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]
plt.scatter(ms_centers[:,0],ms_centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

在这里插入图片描述

# Correct the MeanShift result: remap each cluster id to the label id used
# in the data set (0 -> 2, 1 -> 1, anything else -> 0).
ms_to_label = {0: 2, 1: 1}
y_corrected_ms = [ms_to_label.get(i, 0) for i in y_predict_ms]
# Report the counts of the MeanShift correction itself; the original printed
# y_corrected, which is the KMeans correction from an earlier cell.
print(pd.Series(y_corrected_ms).value_counts(),y.value_counts())
2    1149
1     952
0     899
dtype: int64 2    1156
1     954
0     890
Name: labels, dtype: int64
# Convert the corrected ids to an array so they can be compared element-wise
# and used as boolean masks below.
y_corrected_ms = np.array(y_corrected_ms)
print(type(y_corrected_ms))

# Left panel: corrected MeanShift ids; right panel: labels from the data
# set. The overlay uses the MeanShift centres rather than the KMeans
# `centers`, so the markers belong to the model being shown.
ms_centers = ms.cluster_centers_
fig6 = plt.subplot(121)

label0, label1, label2 = [
    plt.scatter(x.loc[y_corrected_ms==k,'V1'],x.loc[y_corrected_ms==k,'V2'])
    for k in (0, 1, 2)
]

plt.scatter(ms_centers[:,0],ms_centers[:,1])
plt.title('meanshift result')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))

# Right panel is coloured by the labels, so it is titled accordingly.
fig7 = plt.subplot(122)
label0, label1, label2 = [
    plt.scatter(x.loc[y==k,'V1'],x.loc[y==k,'V2']) for k in (0, 1, 2)
]
plt.scatter(ms_centers[:,0],ms_centers[:,1])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

在这里插入图片描述

评论 9
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值