In many practical applications, a large set of data points must be partitioned into groups, called clusters, and the center of each cluster computed. The best-known algorithm for this task is k-means.
The input to k-means is a set of N d-dimensional data points x_1, ..., x_N together with the desired number of clusters k. The algorithm produces the center of each cluster, m_1, ..., m_k, and can also report which data points belong to each cluster. It first chooses initial centroid positions, either at random or by a heuristic search, and then alternates between the following two steps:
- Assignment: given the current k centroids, determine which data points belong to each centroid's cluster; that is, compute the distance from every point to each centroid and assign the point to the cluster whose centroid is nearest.
- Update: recompute each cluster's centroid as the mean of the coordinates of all points currently assigned to that cluster.
The algorithm always terminates, but only at a local optimum that depends on the initial centroids, so in practice it is run several times from different starting points.
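Before turning to the parallel version, the two steps can be made concrete with a minimal serial sketch. This is illustrative only: the function name KMeansStep is ours, it reuses the N, D, K constants that the full program below defines, and it compares squared distances, which selects the same nearest centroid as the true Euclidean distance:

#define N 10000 // Number of data points
#define D 2     // Dimension of data
#define K 3     // Number of clusters

// One k-means pass: assign every point to its nearest centroid,
// then move each centroid to the mean of its assigned points.
// Call repeatedly until the centroids stop moving.
void KMeansStep(double data[N][D], double centroid[K][D])
{
    double sum[K][D] = {{0}};
    int count[K] = {0};

    for(int n = 0; n < N; n++)
    {
        int best = 0;
        double best_dist = -1.0;
        for(int k = 0; k < K; k++)
        {
            double dist = 0.0;
            for(int j = 0; j < D; j++)
                dist += (data[n][j] - centroid[k][j]) * (data[n][j] - centroid[k][j]);
            if(best_dist < 0.0 || dist < best_dist)
            {
                best_dist = dist;
                best = k;
            }
        }
        count[best]++;
        for(int j = 0; j < D; j++)
            sum[best][j] += data[n][j];
    }

    // An empty cluster keeps its previous centroid
    for(int k = 0; k < K; k++)
        if(count[k] > 0)
            for(int j = 0; j < D; j++)
                centroid[k][j] = sum[k][j] / count[k];
}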
The k-means algorithm parallelizes naturally. In a master-worker scheme, one node is responsible for partitioning and distributing the data, while the other nodes carry out the local per-cluster computations and send their results back to the master. This maps directly onto MPI parallel programming. With p worker nodes, the procedure is roughly as follows:
1. Node P_0 partitions the data set into p blocks D_1, ..., D_p and distributes them to the p nodes P_1, ..., P_p;
2. P_0 generates the k initial centroids (m_1, ..., m_k) and broadcasts them to all nodes;
3. Each node P_r computes the distance from every point in its block D_r to the k centroids (m_1, ..., m_k) and assigns the point to the cluster of the nearest centroid;
4. Each node P_r computes, for every cluster, the sum and the count of its local points from D_r, and sends these to P_0;
5. Once P_0 has received these partial results from all nodes, it computes the new centroids (m_1, ..., m_k) and broadcasts them to all nodes;
6. Steps 3-5 are repeated until convergence.
The complete source program follows:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define N 10000 // Number of data points
#define D 2     // Dimension of data
#define K 3     // Number of clusters
#define M 1000  // Number of iterations

typedef double DataPoint[D];

// Generate a random point in [0, 1] ^ D
void GenRandomPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        point[i] = (double)rand() / RAND_MAX;
}

// Reset the point to the origin
void ResetPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        point[i] = 0.0;
}

// Print the point
void PrintPoint(DataPoint point)
{
    for(int i = 0; i < D; i++)
        printf("%g ", point[i]);
    putchar('\n');
}

// Euclidean distance between two points
double Distance(DataPoint p1, DataPoint p2)
{
    double s = 0.0;
    for(int i = 0; i < D; i++)
        s += (p1[i] - p2[i]) * (p1[i] - p2[i]);
    return sqrt(s);
}

// Find the index of the centroid nearest to p
int GetNearestIndex(DataPoint centroid[K], DataPoint p)
{
    // Start from centroid 0 so that min_i is always initialized
    int min_i = 0;
    double min = Distance(centroid[0], p);
    for(int i = 1; i < K; i++)
    {
        double dist = Distance(centroid[i], p);
        if(dist < min)
        {
            min = dist;
            min_i = i;
        }
    }
    return min_i;
}

// Add the point p to sum, component by component
void AddPointToSum(DataPoint sum, DataPoint p)
{
    for(int i = 0; i < D; i++)
        sum[i] += p[i];
}

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Datatype MPI_DATAPOINT;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // The master-worker scheme needs at least two processes
    if(size < 2)
    {
        fprintf(stderr, "This program requires at least two processes. Terminating.\n");
        MPI_Finalize();
        return -1;
    }

    // A derived datatype so a DataPoint can be sent as one unit
    MPI_Type_contiguous(D, MPI_DOUBLE, &MPI_DATAPOINT);
    MPI_Type_commit(&MPI_DATAPOINT);

    DataPoint *full_data;
    DataPoint *data;
    DataPoint centroid[K];

    // Per-cluster partial sums and counts (used by the workers)
    DataPoint sum[K];
    int number[K];

    // Generate the points, and allocate the memory
    if(rank == 0)
    {
        srand((unsigned)time(NULL));

        data = NULL;
        full_data = (DataPoint *)malloc(sizeof(DataPoint) * N);

        for(int i = 0; i < N; i++)
            GenRandomPoint(full_data[i]);

        for(int i = 0; i < K; i++)
            GenRandomPoint(centroid[i]);
    }
    else
    {
        // Each worker holds at most ceil(N / (size - 1)) points
        data = (DataPoint *)malloc(sizeof(DataPoint) * ((N + size - 2) / (size - 1)));
        full_data = NULL;
    }

    // Broadcast the initial centroids
    MPI_Bcast(centroid, K, MPI_DATAPOINT, 0, MPI_COMM_WORLD);

    // Scatter the data points to the workers; rank 0 keeps none
    int *sendcounts = (int *)malloc(sizeof(int) * size);
    int *displs = (int *)malloc(sizeof(int) * size);
    sendcounts[0] = displs[0] = 0;

    int remain = N;
    for(int i = 1; i < size; i++)
    {
        sendcounts[i] = (N + size - 2) / (size - 1);
        if(sendcounts[i] > remain)
            sendcounts[i] = remain;
        displs[i] = displs[i-1] + sendcounts[i-1];
        remain -= sendcounts[i];
    }
    MPI_Scatterv(full_data, sendcounts, displs, MPI_DATAPOINT, data, sendcounts[rank], MPI_DATAPOINT, 0, MPI_COMM_WORLD);

    // Main loop: for simplicity, run a fixed number of iterations
    // instead of testing whether the centroids have stopped moving
    for(int iteration = 0; iteration < M; iteration++)
    {
        if(rank == 0)
        {
            DataPoint sum[K];       // one worker's partial sums
            int number[K];          // one worker's per-cluster counts
            DataPoint total_sum[K];
            int total_number[K];

            // Meanwhile the workers are computing their local sums...

            for(int i = 0; i < K; i++)
            {
                total_number[i] = 0;
                ResetPoint(total_sum[i]);
            }

            for(int i = 1; i < size; i++)
            {
                if(sendcounts[i] > 0)
                {
                    MPI_Recv(number, K, MPI_INT, i, iteration * 2, MPI_COMM_WORLD, &status);
                    MPI_Recv(sum, K, MPI_DATAPOINT, i, iteration * 2 + 1, MPI_COMM_WORLD, &status);

                    for(int j = 0; j < K; j++)
                    {
                        total_number[j] += number[j];
                        AddPointToSum(total_sum[j], sum[j]);
                    }
                }
            }

            // An empty cluster keeps its previous centroid,
            // which also avoids a division by zero
            for(int i = 0; i < K; i++)
                if(total_number[i] > 0)
                    for(int j = 0; j < D; j++)
                        centroid[i][j] = total_sum[i][j] / total_number[i];
        }
        else if(sendcounts[rank] > 0) // workers with no data have nothing to report
        {
            for(int i = 0; i < K; i++)
            {
                ResetPoint(sum[i]);
                number[i] = 0;
            }

            for(int i = 0; i < sendcounts[rank]; i++)
            {
                int index = GetNearestIndex(centroid, data[i]);
                AddPointToSum(sum[index], data[i]);
                number[index]++;
            }

            MPI_Send(number, K, MPI_INT, 0, iteration * 2, MPI_COMM_WORLD);
            MPI_Send(sum, K, MPI_DATAPOINT, 0, iteration * 2 + 1, MPI_COMM_WORLD);
        }

        // End of the iteration: broadcast the new centroids
        MPI_Bcast(centroid, K, MPI_DATAPOINT, 0, MPI_COMM_WORLD);
    }

    if(rank == 0)
    {
        printf("K-means centroids:\n");
        for(int i = 0; i < K; i++)
            PrintPoint(centroid[i]);
    }

    if(rank == 0)
        free(full_data);
    else
        free(data);
    free(sendcounts);
    free(displs);

    MPI_Type_free(&MPI_DATAPOINT);

    MPI_Finalize();
    return 0;
}
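With a typical MPI installation such as Open MPI or MPICH, the program can be built and run roughly as follows; the file name kmeans.c and the choice of five processes (one master plus four workers) are placeholders for illustration:

    mpicc kmeans.c -o kmeans -lm
    mpirun -np 5 ./kmeans

The -lm flag links the math library that sqrt requires.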