模糊C均值聚类的python实现
存在的问题:
用来判断终止条件的变量dist,是前后两次迭代的隶属度矩阵对应元素之差的绝对值的最大值。在鸢尾花(iris)数据集上,dist的值最低只能降到小数点后两位,而网上其他实现能降到小数点后6、7位。
不过两者的标准化互信息(NMI)相差不大,都在0.74左右。
import numpy as np
from sklearn import metrics
import time
def read_txt(path, delimiter=','):
    """Load a delimited numeric text file into features and integer labels.

    Every column except the last is treated as a feature; the last column
    is treated as the class label and cast to an integer dtype.

    Args:
        path: Path of the text file to read.
        delimiter: Column separator passed to ``np.loadtxt``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the 2-D feature array and ``y`` is
        the 1-D integer label array.
    """
    # ndmin=2 keeps a single-row file two-dimensional so the column slicing
    # below works for it as well.
    data = np.loadtxt(path, delimiter=delimiter, ndmin=2)
    # The builtin ``int`` replaces the ``np.int`` alias, which was removed
    # from NumPy and raises AttributeError on current releases.
    y = data[:, -1].astype(int)
    X = data[:, :-1]
    return X, y
def random_membership_matrix(n_sample, k):
    """Create a random fuzzy membership matrix whose rows each sum to 1.

    Args:
        n_sample: Number of samples (rows).
        k: Number of clusters (columns).

    Returns:
        A float array of shape ``(n_sample, k)``. Each row holds ``k``
        uniform random numbers from ``np.random.random`` divided by their
        row sum.
    """
    # Draw the whole matrix at once instead of row by row; this also avoids
    # the ``np.float`` alias, which was removed from NumPy.
    membership_matrix = np.random.random((n_sample, k))
    membership_matrix /= membership_matrix.sum(axis=1, keepdims=True)
    return membership_matrix
def iterate(X, u, k, m, max_iter=20, tol=1e-7):
    """Alternate fuzzy C-means center and membership updates.

    The stopping rule follows
    https://blog.youkuaiyun.com/lyxleft/article/details/88964494 : iteration
    stops once the largest absolute element-wise change of the membership
    matrix between two consecutive iterations is no larger than ``tol``, or
    after ``max_iter`` iterations.

    Args:
        X: Sample array of shape ``(n_sample, dim)``.
        u: Initial membership matrix of shape ``(n_sample, k)``. When it is
            already a float64 ndarray it is updated in place and returned.
        k: Number of clusters.
        m: Fuzzifier exponent; must be greater than 1 (``m == 1`` raises
            ``ZeroDivisionError``).
        max_iter: Maximum number of update rounds.
        tol: Convergence threshold on the membership change.

    Returns:
        A tuple ``(center, u)``: the ``(k, dim)`` array of cluster centers
        computed in the last round and the updated membership matrix.
    """
    X = np.asarray(X, dtype=float)
    u = np.asarray(u, dtype=float)
    n_sample, dim = X.shape
    exponent = -2.0 / (m - 1)
    center = np.zeros((k, dim), dtype=float)  # one cluster center per row
    dist = float('inf')
    iteration = 0

    while dist > tol and iteration < max_iter:
        iteration += 1

        # Centers: mean of the samples weighted by membership ** m.
        weights = np.power(u, m)
        totals = weights.sum(axis=0)
        # A cluster with zero total weight keeps its previous center instead
        # of becoming NaN through a 0/0 division.
        populated = totals > 0
        center[populated] = (
            (weights.T @ X)[populated] / totals[populated, np.newaxis]
        )

        # Memberships: normalised inverse-distance weights to every center.
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - center[np.newaxis, :, :], axis=2
        )
        with np.errstate(divide='ignore', invalid='ignore'):
            inv = np.power(distances, exponent)
            new_u = inv / inv.sum(axis=1, keepdims=True)
        # A sample lying on a center yields an infinite weight and inf/inf
        # would give NaN; share full membership among the coinciding centers.
        coincident = np.isinf(inv)
        on_center = coincident.any(axis=1)
        if on_center.any():
            hits = coincident[on_center].astype(float)
            new_u[on_center] = hits / hits.sum(axis=1, keepdims=True)

        # Compare against the memberships of the PREVIOUS iteration; comparing
        # against the initial matrix would never satisfy the threshold.
        dist = np.abs(new_u - u).max() if new_u.size else 0.0
        u[...] = new_u

    return center, u
def cluster(membership_matrix):
    """Assign every sample to the cluster with its largest membership.

    Args:
        membership_matrix: 2-D array of shape ``(n_sample, k)``.

    Returns:
        A 1-D integer array of length ``n_sample`` holding, for each row,
        the column index of its maximum (the first one on ties).
    """
    membership_matrix = np.asarray(membership_matrix)
    if membership_matrix.shape[0] == 0:
        # argmax cannot reduce over an empty matrix; no samples, no labels.
        return np.zeros((0,), dtype=int)
    # Builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
    return np.argmax(membership_matrix, axis=1).astype(int)
if __name__ == '__main__':
    # Script entry point: cluster the data set with several random restarts
    # and report the restart with the highest NMI against the true labels.
    k = 3  # number of clusters
    m = 2  # fuzzifier exponent passed to iterate()
    fileName = 'iris'
    X, y = read_txt('./dataset/' + fileName + '.txt')
    n_sample, dim = X.shape

    # Run several times and keep the best result. Builtin ``int``/``float``
    # replace the ``np.int``/``np.float`` aliases removed from NumPy.
    best_y_pred = np.zeros((n_sample,), dtype=int)
    # Start below any possible score so the first run is always recorded.
    best_nmi = float('-inf')
    best_center = np.zeros((k, dim), dtype=float)
    for d in range(10):
        print('**********第%d次随机隶属度矩阵**********' % d)
        membership_matrix = random_membership_matrix(n_sample, k)
        center, membership_matrix = iterate(X, membership_matrix, k, m)
        y_pred = cluster(membership_matrix)
        nmi = metrics.normalized_mutual_info_score(y, y_pred)
        if nmi > best_nmi:
            best_nmi = nmi
            best_y_pred = y_pred
            # Remember the centers of the best run so the printed centers
            # match best_nmi rather than whichever run happened to be last.
            best_center = center
    print('best_nmi: %f' % best_nmi)
    print(best_center)