import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
n_samples = 1500
random_state = 170
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
X_aniso = np.dot(X, transformation) # Anisotropic blobs
X_varied, y_varied = make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)  # Unequal variance
X_filtered = np.vstack(
    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
)  # Unevenly sized blobs
y_filtered = [0] * 500 + [1] * 100 + [2] * 10

fig, axs = plt.subplots(2, 2, figsize=(6, 6))
axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
axs[0, 0].set_title("Mixture of Gaussian Blobs")
axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
axs[0, 1].set_title("Anisotropically Distributed Blobs")
axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
axs[1, 0].set_title("Unequal Variance")
axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
axs[1, 1].set_title("Unevenly Sized Blobs")
plt.suptitle("Ground truth clusters").set_y(0.95)K-Means 算法基于这样一个观念:一个簇 (类) 中的每个点都应该靠近该簇的中心。
它的工作原理如下:首先我们选择 k,即我们希望在数据中找到的簇的数量。然后,以某种方式初始化这些 k 个簇的中心,这些中心被称为质心 (Centroid)。
算法然后分为两个交替进行的部分进行:
在 重新分配数据点 步骤中,我们将数据中的每个点分配给其质心最近的簇。
在 更新质心 步骤中,我们重新计算每个质心的位置,作为分配给其簇的所有点的均值(中心)。
然后我们重复这些步骤,直到质心停止移动,或者等价地,直到点停止切换簇。
首先,我们实现一个 K_MeansClustering 的类:class K_MeansClustering:
    def __init__(self, k, dataPoint, randInitFlg=False, **kwargs):
        ''' Initialize parameters and centroids
        '''
        self.k = k
        self.dataPoint = dataPoint
        self.rowNum = self.dataPoint.shape[0]
        self.centroidList = []
        if randInitFlg:
            self.randomInitialization(kwargs.get('randSeed', 0))
        else:
            for i in range(self.k):
                self.centroidList.append(self.dataPoint[i, :])  # the centroids are chosen from the first k data points
        self.lastCentroidList = None
        self.distanceMatrix = np.empty((self.rowNum, self.k))
        self.centroidList = np.asarray(self.centroidList, dtype=np.float32)
    def randomInitialization(self, randSeed=0):
        ''' Random initialization
        '''
        pass

    def calculateCentroidDistance(self, x, c):
        ''' Calculate the distance between the whole data points and one centroid
        '''
        pass

    def computeJe(self):
        ''' Compute the clustering criterion Je
        '''
        # TODO
        ## ----------- Fill in code to compute the clustering criterion ----------- ##
        pass

    def __call__(self):
        ''' Iteratively find the optimal centroids
        '''
        pass

The random initialization function:

def randomInitialization(self, randSeed=0):
    ''' Random initialization
    '''
    # TODO
    ## ----------- Fill in code to complete the random initialization ----------- ##
    np.random.seed(randSeed)
    pass

Computing the distance from every point to a centroid:

def calculateCentroidDistance(self, x, c):
    ''' Calculate the distance between the whole data points and one centroid
    '''
    # TODO
    ## ----------- Fill in code to compute the point-to-centroid distances ----------- ##
    pass

def computeJe(self):
    ''' Compute the clustering criterion Je
    '''
    self.Je = 0.0
    # TODO
    ## ----------- Fill in code to compute the clustering criterion ----------- ##
    pass

def iteration_func(self):
    ''' Iteratively find the optimal centroids
    '''
    while True:
        # TODO
        ## ----------- Fill in code for the K-Means iteration ----------- ##
        # loop through the data points
        pass
        # reassign the data points
        ## array of cluster indices, one per point
        self.nearestCentroids = None
        # termination condition
        pass
        # update the centroids
        ## calculate the mean of each cluster and reassign the value of the centroids
        pass
        # keep track of centroid values
        pass
    return self.nearestCentroids, self.centroidList

K_MeansClustering.randomInitialization = randomInitialization
K_MeansClustering.calculateCentroidDistance = calculateCentroidDistance
K_MeansClustering.computeJe = computeJe
K_MeansClustering.__call__ = iteration_func
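For reference, here is one way the stubs above could be filled in — a minimal sketch rather than the official solution. It assumes Euclidean distances and takes the clustering criterion to be the sum-of-squared-error Je = sum_i sum_{x in C_i} ||x - m_i||^2; the underscore-prefixed names are our own:

def _randomInitialization(self, randSeed=0):
    # pick k distinct data points as the initial centroids
    np.random.seed(randSeed)
    idx = np.random.choice(self.rowNum, self.k, replace=False)
    self.centroidList = np.asarray(self.dataPoint[idx, :], dtype=np.float32)

def _calculateCentroidDistance(self, x, c):
    # Euclidean distance from every row of x to the single centroid c
    return np.linalg.norm(x - c, axis=1)

def _computeJe(self):
    # sum of squared distances from each point to its assigned centroid
    # (assumes __call__ has already set self.nearestCentroids)
    self.Je = 0.0
    for i in range(self.k):
        members = self.dataPoint[self.nearestCentroids == i]
        self.Je += np.sum((members - self.centroidList[i]) ** 2)

def _iteration_func(self):
    while True:
        # reassign: distance of every point to every centroid
        for i in range(self.k):
            self.distanceMatrix[:, i] = self.calculateCentroidDistance(
                self.dataPoint, self.centroidList[i])
        self.nearestCentroids = np.argmin(self.distanceMatrix, axis=1)
        # update: move each centroid to the mean of its cluster
        self.lastCentroidList = self.centroidList.copy()
        for i in range(self.k):
            members = self.dataPoint[self.nearestCentroids == i]
            if len(members) > 0:  # leave an empty cluster's centroid where it is
                self.centroidList[i] = members.mean(axis=0)
        # stop once the centroids no longer move
        if np.allclose(self.centroidList, self.lastCentroidList):
            break
    return self.nearestCentroids, self.centroidList

K_MeansClustering.randomInitialization = _randomInitialization
K_MeansClustering.calculateCentroidDistance = _calculateCentroidDistance
K_MeansClustering.computeJe = _computeJe
K_MeansClustering.__call__ = _iteration_func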
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))
y_pred, centroidList = K_MeansClustering(k=2, dataPoint=X)()
axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)
axs[0, 0].set_title("Non-optimal Number of Clusters")
y_pred, centroidList = K_MeansClustering(k=3, dataPoint=X_aniso)()
axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
axs[0, 1].set_title("Anisotropically Distributed Blobs")
y_pred, centroidList = K_MeansClustering(k=3, dataPoint=X_varied)()
axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
axs[1, 0].set_title("Unequal Variance")
y_pred, centroidList = K_MeansClustering(k=3, dataPoint=X_filtered)()
axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
axs[1, 1].set_title("Unevenly Sized Blobs")
plt.suptitle("Unexpected KMeans clusters").set_y(1.02)首先,我们在数据 X 下测试出不同的质心 (Centroid) 数据对聚类结果的影响。由下图可以看到,合适的质心数目对获得满意的聚类效果至关重要。fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
kmeans1 = K_MeansClustering(k=2, dataPoint=X)
y_pred1, centroidList1 = kmeans1()
kmeans1.computeJe()
axs[0].scatter(X[:, 0], X[:, 1], c=y_pred1)
axs[0].set_title(f"k=2, Je={kmeans1.Je:.2f}")
## ----------- k=3 clustering (one possible fill-in, mirroring the k=2 cell above) ----------- ##
kmeans2 = K_MeansClustering(k=3, dataPoint=X)
y_pred2, centroidList2 = kmeans2()
kmeans2.computeJe()
axs[1].scatter(X[:, 0], X[:, 1], c=y_pred2)
axs[1].set_title(f"k=3, Je={kmeans2.Je:.2f}")
## ----------- k=4 clustering (one possible fill-in, mirroring the k=2 cell above) ----------- ##
kmeans3 = K_MeansClustering(k=4, dataPoint=X)
y_pred3, centroidList3 = kmeans3()
kmeans3.computeJe()
axs[2].scatter(X[:, 0], X[:, 1], c=y_pred3)
axs[2].set_title(f"k=4, Je={kmeans3.Je:.2f}")
plt.suptitle("KMeans clusters with different # of centroids").set_y(1.03)kList = [2, 3, 4, 5, 6]
jeList = []
# TODO
## ----------- 输入代码,计算jeList ----------- ##
pass
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(kList, jeList, marker='o', markersize=4, color='k')
ax.set_xlabel('# of centroids (k)')
ax.set_ylabel('Je');

As the figure above shows, k=3 is the knee of the Je curve: the rate of decrease of Je drops sharply after k=3, which indicates that k=3 suits this dataset.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
# TODO
## ----------- Fill in code to compare different initialization strategies (one sketch follows below) ----------- ##
pass
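One way this comparison might look — a sketch assuming the two strategies to contrast are the default first-k-points initialization and the random initialization (randInitFlg=True); the seed 42 is an arbitrary choice:

km_first = K_MeansClustering(k=3, dataPoint=X)                 # first k points as centroids
km_rand = K_MeansClustering(k=3, dataPoint=X, randInitFlg=True,
                            randSeed=42)                       # random centroids
for ax, km, name in zip(axes, (km_first, km_rand),
                        ('first-k init', 'random init')):
    y_hat, _ = km()
    km.computeJe()
    ax.scatter(X[:, 0], X[:, 1], c=y_hat)
    ax.set_title(f"{name}, Je={km.Je:.2f}")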
from sklearn.datasets import make_moons
twoArcsX, twoArcsy = make_moons(200, noise=.05, random_state=0)
## ----------- Cluster the two-arcs data with K-Means (one possible fill-in) ----------- ##
ta_kmeans = K_MeansClustering(k=2, dataPoint=twoArcsX)
labels, cenList = ta_kmeans()
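Plotting the K-Means labels (our own addition) shows why this dataset is hard for the algorithm: K-Means can only carve the plane into convex regions, so it cannot separate the two interleaved arcs — which motivates the spectral clustering below.

fig, ax = plt.subplots(figsize=(3.5, 3.5))
ax.scatter(twoArcsX[:, 0], twoArcsX[:, 1], c=labels, s=50, cmap='viridis');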
from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', assign_labels='kmeans')
labels = model.fit_predict(twoArcsX)
fig, ax = plt.subplots(figsize=(3.5, 3.5))
plt.scatter(twoArcsX[:, 0], twoArcsX[:, 1], c=labels, s=50, cmap='viridis');

epsilon_list = [0.3, 0.6, 1]
min_samples_list = [5, 7, 9]
from sklearn.cluster import DBSCAN
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))
for eps in epsilon_list:
    for min_samples in min_samples_list:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        ax = axes[epsilon_list.index(eps), min_samples_list.index(min_samples)]
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
        ax.set_title(f"eps={eps}, min_samples={min_samples}")
## ----------- Following the code in 4.1, search for the DBSCAN parameters best suited to X_aniso ----------- ##
## ----------- One possible fill-in; the grid-search loop follows below ----------- ##
epsilon_list = [0.1, 0.2, 0.3]   # candidate values to try (our guess; tune as needed)
min_samples_list = [5, 7, 9]     # candidate values to try (our guess; tune as needed)
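The search itself can reuse the loop from 4.1 verbatim, swapping in X_aniso — a sketch:

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))
for eps in epsilon_list:
    for min_samples in min_samples_list:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_aniso)
        ax = axes[epsilon_list.index(eps), min_samples_list.index(min_samples)]
        ax.scatter(X_aniso[:, 0], X_aniso[:, 1], c=labels, s=50, cmap='viridis')
        ax.set_title(f"eps={eps}, min_samples={min_samples}")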
dataPath = 'images/tiger.png'
imgData = plt.imread(dataPath)[:, :, :3]
fig, ax = plt.subplots(figsize=(4, 3))
plt.imshow(imgData[:, :, :3]);

print(f'The shape of the tiger image: {imgData.shape}')

pixelValues = imgData.reshape((-1, 3))
fig = plt.figure(figsize=(4, 4))
ax = plt.axes(projection='3d')
ax.scatter3D(pixelValues[:, 0], pixelValues[:, 1], pixelValues[:, 2], c=pixelValues[:, 2], cmap='plasma');

k = 3  # choose 3 centroids
## ----------- Cluster the image pixels with k=3 K-Means (one possible fill-in) ----------- ##
kmeans = K_MeansClustering(k=k, dataPoint=pixelValues, randInitFlg=True)  # random init (our choice) avoids starting from three nearly identical corner pixels
centroidIndices, centroidList = kmeans()
fig = plt.figure(figsize=(4, 4))
ax = plt.axes(projection='3d')
ax.scatter3D(pixelValues[:, 0], pixelValues[:, 1], pixelValues[:, 2], c=centroidIndices, cmap='plasma');

newPixels = np.zeros((imgData.shape[0] * imgData.shape[1], 3))
## ----------- Give every point in newPixels the value of its cluster's centroid (one possible fill-in) ----------- ##
newPixels[:] = centroidList[centroidIndices]
segmentedImage = np.reshape(newPixels, imgData.shape)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].imshow(imgData)
axes[1].imshow(segmentedImage);

from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape

# Plot 16 random digits
indices = np.random.choice(digits.data.shape[0], 16)
fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(6, 6))
for ax, ind in zip(axes.flatten(), indices):
    digData = digits.data[ind].reshape((8, 8))
    ax.imshow(digData, interpolation='nearest', cmap=plt.cm.binary)
    ax.set(xticks=[], yticks=[])

from sklearn.cluster import KMeans
## ----------- Cluster the digits data (one possible fill-in, using sklearn's KMeans) ----------- ##
k = 10
kmeans = KMeans(n_clusters=k, random_state=0)  # random_state for reproducibility (our choice)
clusters = kmeans.fit_predict(digits.data)

fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

from scipy.stats import mode
labels = np.zeros_like(clusters)
# relabel each cluster with the most common true digit it contains
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(digits.target[mask])[0]

from sklearn.metrics import accuracy_score
accuracy_score(digits.target, labels)

from sklearn.metrics import confusion_matrix
import seaborn as sns
mat = confusion_matrix(digits.target, labels)
sns.heatmap(mat.T, square=True, annot=True, fmt='d',
            cbar=False, cmap='Blues',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label');

from sklearn.manifold import TSNE
# Project the data: this step will take several seconds
tsne = TSNE(n_components=2, init='random',
            learning_rate=0.1, random_state=0)
digits_proj = tsne.fit_transform(digits.data)
## ----------- Cluster the t-SNE features with K-Means (one possible fill-in) ----------- ##
# Compute the clusters
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits_proj)
# Permute the labels, exactly as above
labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(digits.target[mask])[0]
# Compute the accuracy
accuracy_score(digits.target, labels)