In many practical applications, a large set of data points must be partitioned into groups, called clusters, and the center of each cluster computed. The best-known algorithm for this task is k-means.
The input to k-means is a set of N d-dimensional data points x_1, ..., x_N together with the desired number of clusters k. The algorithm produces the center of each cluster, m_1, ..., m_k, and can also report which data points belong to each cluster. It first chooses initial centroid positions, either at random or by a heuristic search, and then alternates between the following two steps:
- Assignment: given the current k centroids, determine which data points belong to each centroid's cluster; that is, compute the distance from every point to each centroid and assign the point to the cluster whose centroid is nearest.
- Update: recompute each cluster's centroid as the mean of the coordinates of all points currently assigned to that cluster.
The algorithm always terminates, but only at a local optimum that depends on the initial centroids, so in practice it is run several times from different starting points.
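Before turning to the parallel version, the two steps can be made concrete with a minimal serial sketch. This is illustrative only: the function name KMeansStep is ours, it reuses the N, D, K constants that the full program below defines, and it compares squared distances, which selects the same nearest centroid as the true Euclidean distance:

#define N 10000 // Number of data points
#define D 2     // Dimension of data
#define K 3     // Number of clusters

// One k-means pass: assign every point to its nearest centroid,
// then move each centroid to the mean of its assigned points.
// Call repeatedly until the centroids stop moving.
void KMeansStep(double data[N][D], double centroid[K][D])
{
    double sum[K][D] = {{0}};
    int count[K] = {0};

    for(int n = 0; n < N; n++)
    {
        int best = 0;
        double best_dist = -1.0;
        for(int k = 0; k < K; k++)
        {
            double dist = 0.0;
            for(int j = 0; j < D; j++)
                dist += (data[n][j] - centroid[k][j]) * (data[n][j] - centroid[k][j]);
            if(best_dist < 0.0 || dist < best_dist)
            {
                best_dist = dist;
                best = k;
            }
        }
        count[best]++;
        for(int j = 0; j < D; j++)
            sum[best][j] += data[n][j];
    }

    // An empty cluster keeps its previous centroid
    for(int k = 0; k < K; k++)
        if(count[k] > 0)
            for(int j = 0; j < D; j++)
                centroid[k][j] = sum[k][j] / count[k];
}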
The k-means algorithm parallelizes naturally. In a master-worker scheme, one node is responsible for partitioning and distributing the data, while the other nodes carry out the local per-cluster computations and send their results back to the master. This maps directly onto MPI parallel programming. With p worker nodes, the procedure is roughly as follows:
1. Node P_0 partitions the data set into p blocks D_1, ..., D_p and distributes them to the p nodes P_1, ..., P_p;
2. P_0 generates the k initial centroids (m_1, ..., m_k) and broadcasts them to all nodes;
3. Each node P_r computes the distance from every point in its block D_r to the k centroids (m_1, ..., m_k) and assigns the point to the cluster of the nearest centroid;
4. Each node P_r computes, for every cluster, the sum and the count of its local points from D_r, and sends these to P_0;
5. Once P_0 has received these partial results from all nodes, it computes the new centroids (m_1, ..., m_k) and broadcasts them to all nodes;
6. Steps 3-5 are repeated until convergence.
The complete source program follows:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define N 10000 // Number of data points
#define D 2     // Dimension of data
#define K 3     // Number of clusters
#define M 1000  // Number of iterations

typedef double DataPoint[D];

// Generate a random point in [0, 1] ^ D
void GenRandomPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        point[i] = (double)rand() / RAND_MAX;
}

// Reset the point to the origin
void ResetPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        point[i] = 0.0;
}

// Print the point
void PrintPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        printf("%g ", point[i]);
    putchar('\n');
}

// Euclidean distance between two points
double Distance(DataPoint p1, DataPoint p2)
{
    double s = 0.0;
    for(int i = 0; i < D; i++)
        s += (p1[i] - p2[i]) * (p1[i] - p2[i]);
    return sqrt(s);
}

// Find the index of the centroid nearest to p
int GetNearestIndex(DataPoint centroid[K], DataPoint p)
{
    // Start from centroid 0 so that min_i is always initialized
    int min_i = 0;
    double min = Distance(centroid[0], p);
    for(int i = 1; i < K; i++)
    {
        double dist = Distance(centroid[i], p);
        if(dist < min)
        {
            min = dist;
            min_i = i;
        }
    }
    return min_i;
}

// Add the point p to sum, component by component
void AddPointToSum(DataPoint sum, DataPoint p)
{
    for(int i = 0; i < D; i++)
        sum[i] += p[i];
}

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Datatype MPI_DATAPOINT;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // The master-worker scheme needs at least two processes
    if(size < 2)
    {
        fprintf(stderr, "This program requires at least two processes. Terminating.\n");
        MPI_Finalize();
        return -1;
    }

    // A derived datatype so a DataPoint can be sent as one unit
    MPI_Type_contiguous(D, MPI_DOUBLE, &MPI_DATAPOINT);
    MPI_Type_commit(&MPI_DATAPOINT);

    DataPoint *full_data;
    DataPoint *data;
    DataPoint centroid[K];

    // Per-cluster partial sums and counts (used by the workers)
    DataPoint sum[K];
    int number[K];

    // Generate the points, and allocate the memory
    if(rank == 0)
    {
        srand((unsigned)time(NULL));

        data = NULL;
        full_data = (DataPoint *)malloc(sizeof(DataPoint) * N);

        for(int i = 0; i < N; i++)
            GenRandomPoint(full_data[i]);

        for(int i = 0; i < K; i++)
            GenRandomPoint(centroid[i]);
    }
    else
    {
        // Each worker holds at most ceil(N / (size - 1)) points
        data = (DataPoint *)malloc(sizeof(DataPoint) * ((N + size - 2) / (size - 1)));
        full_data = NULL;
    }

    // Broadcast the initial centroids
    MPI_Bcast(centroid, K, MPI_DATAPOINT, 0, MPI_COMM_WORLD);

    // Scatter the data points to the workers; rank 0 keeps none
    int *sendcounts = (int *)malloc(sizeof(int) * size);
    int *displs = (int *)malloc(sizeof(int) * size);
    sendcounts[0] = displs[0] = 0;

    int remain = N;
    for(int i = 1; i < size; i++)
    {
        sendcounts[i] = (N + size - 2) / (size - 1);
        if(sendcounts[i] > remain)
            sendcounts[i] = remain;
        displs[i] = displs[i-1] + sendcounts[i-1];
        remain -= sendcounts[i];
    }
    MPI_Scatterv(full_data, sendcounts, displs, MPI_DATAPOINT, data, sendcounts[rank], MPI_DATAPOINT, 0, MPI_COMM_WORLD);

    // Main loop: for simplicity, run a fixed number of iterations
    // instead of testing whether the centroids have stopped moving
    for(int iteration = 0; iteration < M; iteration++)
    {
        if(rank == 0)
        {
            DataPoint sum[K];       // one worker's partial sums
            int number[K];          // one worker's per-cluster counts
            DataPoint total_sum[K];
            int total_number[K];

            // Meanwhile the workers are computing their local sums...

            for(int i = 0; i < K; i++)
            {
                total_number[i] = 0;
                ResetPoint(total_sum[i]);
            }

            for(int i = 1; i < size; i++)
            {
                if(sendcounts[i] > 0)
                {
                    MPI_Recv(number, K, MPI_INT, i, iteration * 2, MPI_COMM_WORLD, &status);
                    MPI_Recv(sum, K, MPI_DATAPOINT, i, iteration * 2 + 1, MPI_COMM_WORLD, &status);

                    for(int j = 0; j < K; j++)
                    {
                        total_number[j] += number[j];
                        AddPointToSum(total_sum[j], sum[j]);
                    }
                }
            }

            // An empty cluster keeps its previous centroid,
            // which also avoids a division by zero
            for(int i = 0; i < K; i++)
                if(total_number[i] > 0)
                    for(int j = 0; j < D; j++)
                        centroid[i][j] = total_sum[i][j] / total_number[i];
        }
        else if(sendcounts[rank] > 0) // workers with no data have nothing to report
        {
            for(int i = 0; i < K; i++)
            {
                ResetPoint(sum[i]);
                number[i] = 0;
            }

            for(int i = 0; i < sendcounts[rank]; i++)
            {
                int index = GetNearestIndex(centroid, data[i]);
                AddPointToSum(sum[index], data[i]);
                number[index]++;
            }

            MPI_Send(number, K, MPI_INT, 0, iteration * 2, MPI_COMM_WORLD);
            MPI_Send(sum, K, MPI_DATAPOINT, 0, iteration * 2 + 1, MPI_COMM_WORLD);
        }

        // End of the iteration: broadcast the new centroids
        MPI_Bcast(centroid, K, MPI_DATAPOINT, 0, MPI_COMM_WORLD);
    }

    if(rank == 0)
    {
        printf("K-means centroids:\n");
        for(int i = 0; i < K; i++)
            PrintPoint(centroid[i]);
    }

    if(rank == 0)
        free(full_data);
    else
        free(data);
    free(sendcounts);
    free(displs);

    MPI_Type_free(&MPI_DATAPOINT);

    MPI_Finalize();
    return 0;
}
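With a typical MPI installation such as Open MPI or MPICH, the program can be built and run roughly as follows; the file name kmeans.c and the choice of five processes (one master plus four workers) are placeholders for illustration:

    mpicc kmeans.c -o kmeans -lm
    mpirun -np 5 ./kmeans

The -lm flag links the math library that sqrt requires.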