机器学习:Python K-means 聚类

本文通过实例展示了K-means聚类算法的应用过程,包括如何选择最佳聚类数k,评估聚类效果,并探讨了算法可能存在的局限性。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
'''容易收敛到局部最优解,需要预先设定类别数量'''
# 使用pandas分别读取训练数据与测试数据集
# digits_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', header=None)
# digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', header=None)
#
# # 从训练与测试数据集上都分离出64维度的像素特征与1维度的数字目标
# X_train = digits_train[np.arange(64)]
# y_train = digits_train[64]
#
# X_test = digits_test[np.arange(64)]
# y_test = digits_test[64]
#
# kmeans = KMeans(n_clusters=10)
# kmeans.fit(X_train)
# y_pred = kmeans.predict(X_test)
# #ARI指标同分类指标相似,适用于数据本身带有正确类别信息的情况
# print(metrics.adjusted_rand_score(y_test,y_pred))

# #用轮廓系数来进行评价
# '''sc = (bi - ai) / max(ai, bi),bi是类别之间的分离度,ai是凝聚度'''
# plt.subplot(3,2,1)
# x1 = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
# x2 = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
# X = np.array([(1, 1), (2, 3), (3, 2), (1, 2), (5, 8), (6, 6), (5, 7), (5, 6), (6, 7), (7, 1), (8, 2), (9, 1), (7, 1), (9, 3)])
#
# plt.xlim([0, 10])
# plt.ylim([0, 10])
# plt.title('Instances')
# plt.scatter(x1, x2)
#
# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
# markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']
#
# clusters = [2, 3, 4, 5, 8]
# subplot_counter = 1
# sc_scores = []
# for t in clusters:
#     subplot_counter += 1
#     plt.subplot(3,2,subplot_counter)
#     kmeans_model = KMeans(n_clusters=t).fit(X)
#     for i,l in enumerate(kmeans_model.labels_):
#         plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
#     plt.xlim([0, 10])
#     plt.ylim([0, 10])
#     sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
#     sc_scores.append(sc_score)
#     plt.title('K = %s, silhouette coefficient= %0.03f' % (t, sc_score))
#
# # 绘制轮廓系数与不同类簇数量的关系曲线。
# plt.figure()
# plt.plot(clusters, sc_scores, '*-')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Coefficient Score')
# plt.show()

import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# Draw three synthetic clusters from uniform distributions. Each array has
# shape (2, 10): row 0 holds the x1 coordinates and row 1 the x2 coordinates
# of that cluster's 10 samples.
cluster_bounds = ((0.5, 1.5), (5.5, 6.5), (3.0, 4.0))
cluster1, cluster2, cluster3 = (
    np.random.uniform(low, high, (2, 10)) for low, high in cluster_bounds
)

# Join the clusters side by side and transpose so that X has one row per
# sample (30 rows, 2 columns), then show the raw samples as a scatter plot.
X = np.concatenate([cluster1, cluster2, cluster3], axis=1).transpose()
x1_values, x2_values = X[:, 0], X[:, 1]
plt.scatter(x1_values, x2_values)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
# Elbow method: fit K-means for k = 1..9 and record, for each k, the mean
# Euclidean distance from every sample to its nearest cluster centre, then
# plot that average against k.
K = range(1, 10)
meandistortions = []

for k in K:
    # n_init is set explicitly: K-means easily converges to a local optimum,
    # and the library default for the number of restarts differs between
    # scikit-learn releases, so relying on it makes the curve depend on the
    # installed version.
    kmeans = KMeans(n_clusters=k, n_init=10)
    kmeans.fit(X)
    # cdist yields an (n_samples, k) distance matrix; the row-wise minimum is
    # each sample's distance to its closest centre.
    nearest_centre_dist = cdist(X, kmeans.cluster_centers_, 'euclidean').min(axis=1)
    meandistortions.append(nearest_centre_dist.mean())

plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average Dispersion')
plt.title('Selecting k with the Elbow Method')
plt.show()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值