Reference: *Machine Learning in Action* (《机器学习实战》).
First, the k-means algorithm:

Input: sample set $D = \{x_1, x_2, \dots, x_m\}$; number of clusters $k$
Process:
- Randomly select $k$ samples from $D$ as the initial mean vectors $\{\mu_1, \mu_2, \dots, \mu_k\}$
- repeat
- Let $C_i = \varnothing$ ($1 \le i \le k$)
- for $j = 1, 2, \dots, m$ do
- Compute the distance between sample $x_j$ and each mean vector $\mu_i$ ($1 \le i \le k$): $d_{ji} = \lVert x_j - \mu_i \rVert_2$
- Determine the cluster label of $x_j$ from the nearest mean vector: $\lambda_j = \arg\min_{i \in \{1, 2, \dots, k\}} d_{ji}$
- Assign sample $x_j$ to the corresponding cluster: $C_{\lambda_j} = C_{\lambda_j} \cup \{x_j\}$
- end for
- for $i = 1, 2, \dots, k$ do
- Compute the new mean vector: $\mu_i' = \frac{1}{|C_i|} \sum_{x \in C_i} x$
- if $\mu_i' \neq \mu_i$ then
- Update the current mean vector $\mu_i$ to $\mu_i'$
- else
- Keep the current mean vector unchanged
- end if
- end for
- until none of the mean vectors have been updated
Output: cluster partition $C = \{C_1, C_2, \dots, C_k\}$
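The assignment and update steps in the pseudocode (computing $d_{ji}$, taking the argmin, and recomputing each $\mu_i$) map directly onto NumPy array operations. The following is a minimal vectorized sketch of those two steps only, under the assumption that `X` has shape (m, n) and `centroids` has shape (k, n); the function name `assign_and_update` is illustrative and is not part of the implementation below.

import numpy as np

def assign_and_update(X, centroids):
    # d_ji = ||x_j - mu_i||_2 for every sample j and every mean vector i -> shape (m, k)
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    # lambda_j = argmin_i d_ji: index of the nearest mean vector for each sample
    labels = dists.argmin(axis=1)
    # mu_i' = mean of the samples assigned to cluster i (keep the old mean if the cluster is empty)
    new_centroids = np.array([
        X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
        for i in range(centroids.shape[0])
    ])
    return labels, new_centroids

The full loop-based implementation is given below.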
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 16 21:54:20 2018
@author: Li Qingquan
"""
import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

seed = 2018          # np.random.seed() returns None, so keep the seed value itself for make_blobs
np.random.seed(seed)


class KMeans(object):
    def __init__(self, n_clusters):
        '''
        :param n_clusters: k, the number of clusters to form
        '''
        self.n_clusters = n_clusters
    def fit(self, X):
        '''
        Compute K-Means clustering.

        :param X: array of shape (m, n), the samples to cluster
        :return: None; the learned centroids and cluster assignments are stored on the object
        '''
        # choose k initial centroids uniformly at random inside the bounding box of the data
        n = X.shape[1]
        centroids = np.zeros((self.n_clusters, n))  # store the centroids
        for i in range(n):
            minNum = np.min(X[:, i])
            maxNum = np.max(X[:, i])
            centroids[:, i] = minNum + (maxNum - minNum) * np.random.rand(self.n_clusters)
        self.centroids = centroids

        # update the centroids until no sample changes its cluster
        m = X.shape[0]
        # Assment[i] holds (cluster index of sample i, squared distance to that cluster's centroid)
        Assment = np.zeros((m, 2))
        clusterChanged = True
        while clusterChanged:
            clusterChanged = False
            for i in range(m):  # assign each sample to its closest centroid
                minDist = np.inf
                minIndex = -1
                for j in range(self.n_clusters):
                    # Euclidean distance between sample i and centroid j
                    distBetw = np.sqrt(np.sum(np.power(centroids[j, :] - X[i, :], 2)))
                    if distBetw < minDist:
                        minDist = distBetw
                        minIndex = j
                if Assment[i, 0] != minIndex:
                    clusterChanged = True
                Assment[i, :] = minIndex, minDist ** 2
            for i in range(self.n_clusters):  # update centroids
                indexAll = Assment[:, 0]
                valueI = np.nonzero(indexAll == i)
                pointsIn = X[valueI[0]]
                if len(pointsIn) > 0:  # an empty cluster would make np.mean return NaN
                    centroids[i, :] = np.mean(pointsIn, axis=0)  # new centroid = mean of its cluster
        self.centroids = centroids
        self.Assment = Assment
    def predict(self, X):
        '''
        Predict the closest cluster each sample in X belongs to.

        :param X: array of shape (m, n)
        :return: array of shape (m,), the predicted cluster index of each sample
        '''
        m = X.shape[0]
        y_pred = np.empty((m,))
        for i in range(m):  # assign each sample to its closest centroid
            minDist = np.inf
            for j in range(self.n_clusters):
                distBetw = np.sqrt(np.sum(np.power(self.centroids[j, :] - X[i, :], 2)))
                if distBetw < minDist:
                    minDist = distBetw
                    y_pred[i] = j
        return y_pred

    def fit_predict(self, X):
        '''
        Fit the model on X and return the predicted cluster index of each sample.

        :param X: array of shape (m, n)
        :return: array of shape (m,) of cluster indices
        '''
        self.fit(X)
        return self.predict(X)

if __name__ == '__main__':
    ################################# 1. define params ###################################
    # number of samples
    n_samples = 1000
    # number of clusters; you can choose the value yourself, the recommended value is 3
    n_clusters = 4
    # generate the data
    X, y = make_blobs(n_samples=n_samples, random_state=seed)

    ################################# 2. initialize the model ############################
    kmeans_model = KMeans(n_clusters=n_clusters)

    ################################# 3. training and predicting #########################
    start_time = time.time()
    y_pred = kmeans_model.fit_predict(X)
    end_time = time.time()
    print('Training and predicting time: ', end_time - start_time)

    ################################# 4. plot #############################################
    plt.scatter(X[:, 0], X[:, 1], c=y_pred)
    plt.title("Unevenly Sized Blobs")
    plt.show()
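As an optional sanity check (assuming `X`, `n_clusters`, and `seed` are defined as in the script above, and that scikit-learn is installed, which it must be for `make_blobs`), the partition can be compared against scikit-learn's own `KMeans`; up to a permutation of the cluster indices, the two label vectors should largely agree on this data.

from sklearn.cluster import KMeans as SklearnKMeans

# reference clustering of the same data with scikit-learn
sk_pred = SklearnKMeans(n_clusters=n_clusters, n_init=10, random_state=seed).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=sk_pred)
plt.title("scikit-learn KMeans")
plt.show()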