beer数据集 聚类分析
import pandas as pd
# Load the beer table from a space-delimited text file; the first row
# supplies the column names (name, calories, sodium, alcohol, cost).
beer = pd.read_csv('./data/data.txt', delimiter=' ')
# Echo the whole frame so the raw data can be eyeballed.
print(beer)
name calories sodium alcohol cost
0 Budweiser 144 15 4.7 0.43
1 Schlitz 151 19 4.9 0.43
2 Lowenbrau 157 15 0.9 0.48
3 Kronenbourg 170 7 5.2 0.73
4 Heineken 152 11 5.0 0.77
5 Old_Milwaukee 145 23 4.6 0.28
6 Augsberger 175 24 5.5 0.40
7 Srohs_Bohemian_Style 149 27 4.7 0.42
8 Miller_Lite 99 10 4.3 0.43
9 Budweiser_Light 113 8 3.7 0.40
10 Coors 140 18 4.6 0.44
11 Coors_Light 102 15 4.1 0.46
12 Michelob_Light 135 11 4.2 0.50
13 Becks 150 19 4.7 0.76
14 Kirin 149 6 5.0 0.79
15 Pabst_Extra_Light 68 15 2.3 0.38
16 Hamms 139 19 4.4 0.43
17 Heilemans_Old_Style 144 24 4.9 0.43
18 Olympia_Goled_Light 72 6 2.9 0.46
19 Schlitz_Light 97 7 4.2 0.47
# Feature matrix for clustering: every row, restricted to the four
# measurement columns (the 'name' column is left out).
X = beer.loc[:, ['calories', 'sodium', 'alcohol', 'cost']]
K-means聚类算法
from sklearn.cluster import KMeans
# Fit two k-means models on the same feature matrix X:
# `km` partitions the rows into three clusters, `km2` into two.
# NOTE(review): no random_state is passed, so cluster numbering may differ
# between runs -- confirm whether reproducibility matters here.
km = KMeans(n_clusters=3)
km.fit(X)
km2 = KMeans(n_clusters=2)
km2.fit(X)
# Bare expression: displays the per-row labels of the 3-cluster model
# when run as a notebook cell.
km.labels_
array([0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 2])
# Attach each model's per-row label to the frame: 'cluster' comes from the
# 3-cluster model, 'cluster2' from the 2-cluster model.
beer['cluster'] = km.labels_
beer['cluster2'] = km2.labels_
# Display the rows grouped together by their 3-cluster label; the sorted
# copy is not stored, so `beer` itself keeps its original row order.
beer.sort_values(by='cluster')
|
name |
calories |
sodium |
alcohol |
cost |
cluster |
cluster2 |
0 |
Budweiser |
144 |
15 |
4.7 |
0.43 |
0 |
0 |
1 |
Schlitz |
151 |
19 |
4.9 |
0.43 |
0 |
0 |
2 |
Lowenbrau |
157 |
15 |
0.9 |
0.48 |
0 |
0 |
3 |
Kronenbourg |
170 |
7 |
5.2 |
0.73 |
0 |
0 |
4 |
Heineken |
152 |
11 |
5.0 |
0.77 |
0 |
0 |
5 |
Old_Milwaukee |
145 |
23 |
4.6 |
0.28 |
0 |
0 |
6 |
Augsberger |
175 |
24 |
5.5 |
0.40 |
0 |
0 |
7 |
Srohs_Bohemian_Style |
149 |
27 |
4.7 |
0.42 |
0 |
0 |
17 |
Heilemans_Old_Style |
144 |
24 |
4.9 |
0.43 |
0 |
0 |
10 |
Coors |
140 |
18 |
4.6 |
0.44 |
0 |
0 |
16 |
Hamms |
139 |
19 |
4.4 |
0.43 |
0 |
0 |
12 |
Michelob_Light |
135 |
11 |
4.2 |
0.50 |
0 |
0 |
13 |
Becks |
150 |
19 |
4.7 |
0.76 |
0 |
0 |
14 |
Kirin |
149 |
6 |
5.0 |
0.79 |
0 |
0 |
18 |
Olympia_Goled_Light |
72 |
6 |
2.9 |
0.46 |
1 |
1 |
15 |
Pabst_Extra_Light |
68 |
15 |
2.3 |
0.38 |
1 |
1 |
9 |
Budweiser_Light |
113 |
8 |
3.7 |
0.40 |
2 |
1 |
8 |
Miller_Lite |
99 |
10 |
4.3 |
0.43 |
2 |
1 |
11 |
Coors_Light |
102 |
15 |
4.1 |
0.46 |
2 |
1 |
19 |
Schlitz_Light |
97 |
7 |
4.2 |
0.47 |
2 |
1 |
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module is gone from current pandas, so importing from it fails. Fall back
# to the old path only for installs that predate pandas.plotting.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
%matplotlib inline
# Keep the fitted centroids of both models for later use:
# `cluster_centers` from the 3-cluster model, `cluster2_centers` from the
# 2-cluster model.
cluster_centers = km.cluster_centers_
cluster2_centers = km2.cluster_centers_
# Per-cluster mean of every numeric column. `beer` still contains the string
# column 'name'; numeric_only=True skips it explicitly, because newer pandas
# raises a TypeError instead of silently dropping non-numeric columns.
beer.groupby('cluster').mean(numeric_only=True)