欧式距离

最新推荐文章于 2023-12-30 19:50:12 发布

原创最新推荐文章于 2023-12-30 19:50:12 发布 · 367 阅读

0 ·

CC 4.0 BY-SA版权

（浮点计算，类封装，最后的分类结果和质心点输出到CSV文件）

平面坐标系：
x，y
point1:
（x1, y1）
point2:
（x2, y2）

sqrt((x1-x2)^2 + (y1-y2)^2)

随机生成100个点：
分成N类：
在100个点中随机出N个点作为初始的分类中心点

KMeans：

计算其他点和这N个点之间的距离，将每个点分给距离最近的中心点

收敛条件：
N1: 21 计算新的质心：N11
N2：31 计算新的质心：N12
N3：11 计算新的质心：N13
N4: 31 计算新的质心：N14
N5：6 计算新的质心：N15

迭代（重复“分类 → 计算新质心”）：
直到新旧质心之间的平均距离不超过阈值为止，例如 0.0000001

完整代码如下：

import random
import matplotlib.pyplot as plt
import numpy

class KMeans():
    """Plain-Python K-means clustering using Euclidean distance.

    Points are sequences of numbers (any dimension, but all of the same
    length). The initial centers are drawn at random from the training data.
    """

    def __init__(self, k=1, verbose=True):
        '''
        :param k: number of clusters
        :param verbose: when True (the default) the intermediate centers,
            distance matrices and center shifts are printed while fitting
        '''
        self.__k = k
        self.__verbose = verbose
        self.__data = []         # the raw training points
        self.__pointCenter = []  # current centers; seeded randomly from __data
        self.__result = [[] for _ in range(k)]  # one list of points per cluster

    def fit(self, data, threshold, times=10000):
        '''
        Train the model.

        :param data: training points
        :param threshold: stop once the mean distance between the previous
            and the new centers is no longer greater than this value
        :param times: maximum number of refinement iterations
        :return: tuple ``(centers, clusters)`` where ``clusters[i]`` holds the
            points assigned to ``centers[i]``
        :raises ValueError: if k < 1, data is empty, or data holds fewer than
            k distinct points
        '''
        if self.__k < 1:
            raise ValueError("k must be at least 1")
        if len(data) == 0:
            raise ValueError("data must contain at least one point")

        self.__data = data
        # Start from a clean state so that calling fit() again neither reuses
        # stale centers nor appends points onto the previous clusters.
        self.__pointCenter = []
        self.randomCenter()
        self._log(self.__pointCenter)

        oldCenterPoint = self.__pointCenter
        self.__result = self._assign(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)
        shift = self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint)

        # `times` caps the number of iterations so a threshold that can never
        # be reached does not loop forever.
        while times > 0 and shift > threshold:
            times -= 1
            # Keep the previous centers to measure how far they move.
            oldCenterPoint = newCenterPoint
            result = self._assign(oldCenterPoint)
            newCenterPoint = self.calNewPointCenter(result, oldCenterPoint)
            shift = self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint)
            self._log(shift)
            self.__result = result

        self.__pointCenter = newCenterPoint
        return newCenterPoint, self.__result

    def _log(self, value):
        # Print debugging output only when verbose mode is enabled.
        if self.__verbose:
            print(value)

    def _assign(self, center):
        # Put every training point into the cluster of its nearest center
        # (the first center wins on ties).
        clusters = [[] for _ in range(self.__k)]
        distances = self.calPointCenterDistance(center, self.__data)
        for point, row in zip(self.__data, distances):
            clusters[row.index(min(row))].append(point)
        return clusters

    def calCenterToCenterDistance(self, old, new):
        '''
        Mean distance between corresponding old and new centers.

        :param old: centers of the previous iteration
        :param new: freshly computed centers
        :return: sum of the pairwise distances divided by ``len(old)``
        :raises ZeroDivisionError: if ``old`` is empty
        '''
        total = sum(self.distance(point1, point2) for point1, point2 in zip(old, new))
        return total / len(old)

    def calPointCenterDistance(self, center, data):
        '''
        Distance from every point to every center.

        :param center: list of center points
        :param data: list of points
        :return: one row per point; row[j] is the distance to center[j]
        '''
        centerDistance = [
            [self.distance(point, c) for c in center] for point in data
        ]
        self._log(centerDistance)
        return centerDistance

    def calNewPointCenter(self, result, oldCenter=None):
        '''
        Compute the new center (per-dimension mean) of every cluster.

        :param result: list of clusters, each a list of points
        :param oldCenter: optional previous centers; an empty cluster keeps
            its previous center because it has no points to average
        :return: list of new centers, in cluster order
        :raises ValueError: if a cluster is empty and ``oldCenter`` is not given
        '''
        newCenterPoint = []
        for index, cluster in enumerate(result):
            if len(cluster) == 0:
                if oldCenter is None:
                    raise ValueError(
                        "cluster %d is empty and no previous center was given" % index
                    )
                newCenterPoint.append(list(oldCenter[index]))
                continue
            # zip(*cluster) transposes the points into per-dimension columns.
            newCenterPoint.append(
                [sum(column) / len(cluster) for column in zip(*cluster)]
            )
        self._log(newCenterPoint)
        return newCenterPoint

    def distance(self, pointer1, pointer2):
        '''
        Euclidean distance between two points of any (equal) dimension.

        :param pointer1: first point
        :param pointer2: second point
        :return: the distance as a float
        '''
        return sum((x1 - x2) ** 2 for x1, x2 in zip(pointer1, pointer2)) ** 0.5

    def randomCenter(self):
        '''
        Randomly pick distinct points from __data until k centers are held.

        :raises ValueError: if __data is empty or holds fewer than k distinct
            points (the rejection loop below could otherwise never finish)
        '''
        if len(self.__data) == 0:
            raise ValueError("data must contain at least one point")
        distinct = {tuple(point) for point in self.__data}
        if len(distinct) < self.__k:
            raise ValueError(
                "need at least %d distinct points, got %d" % (self.__k, len(distinct))
            )
        while len(self.__pointCenter) < self.__k:
            # Draw a random index and accept the point only if it is not
            # already one of the centers.
            index = random.randint(0, len(self.__data) - 1)
            if self.__data[index] not in self.__pointCenter:
                self.__pointCenter.append(self.__data[index])
if __name__ == "__main__":
    # Demo: cluster 1000 random integer points into 5 groups, ten times over,
    # showing a scatter plot of each run. The data set is shared by all runs;
    # only the random initial centers differ.
    data = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(1000)]
    for _ in range(10):
        kmeans = KMeans(k=5)
        centerPoint, result = kmeans.fit(data, 0.0001)
        print(centerPoint)

        plt.title("KMeans Classification")
        all_x = []    # x coordinate of every point, cluster by cluster
        all_y = []    # y coordinate of every point, same order as all_x
        colors = []   # one numeric color value per point
        for label, cluster in enumerate(result):
            # Iterating the points directly (instead of transposing via
            # cluster[0]) keeps an empty cluster from raising IndexError.
            all_x.extend(point[0] for point in cluster)
            all_y.extend(point[1] for point in cluster)
            # Clusters get the color values 0, 2, 4, ...
            colors.extend([label * 2] * len(cluster))
        plt.scatter(all_x, all_y, c=colors, s=30)
        plt.show()

欧式距离是 Python 算法里最简单的算法之一，我现在掌握得还很不熟练，对于这一系列的方法调用，有的地方也还理不清，

不过我相信一直练习，总会熟练掌握的