实现基于Kmeans的商品价格聚类
# -*-coding:utf-8-*-
"""
Author: Thinkgamer
Desc:
代码4-9 实现基于Kmeans的商品价格聚类
"""
import numpy as np
import pandas as pd
import random
class kMeans:
    """One-dimensional k-means clustering of product prices.

    Clusters are represented as a dict mapping a cluster id (0..K-1) to
    ``{"center": <number>, "values": [<prices assigned to it>]}``.
    """

    def __init__(self):
        pass

    def loadData(self, file):
        """Read a comma-separated file whose first row is the header."""
        return pd.read_csv(file, header=0, sep=",")

    def filterAnomalyValue(self, data):
        """Drop outlier rows based on the ``price`` column.

        The bounds start from mean +/- 3 standard deviations.  The upper
        bound is never allowed to fall below 5000 and the lower bound is
        never allowed to fall below 1.  Both bounds are exclusive.

        Returns:
            (filtered data, upper bound, lower bound)
        """
        prices = data["price"]
        center = np.mean(prices)
        spread = 3 * np.std(prices)
        upper = center + spread
        lower = center - spread
        upper_limit = upper if upper > 5000 else 5000
        lower_limit = lower if lower > 1 else 1
        print("最大异常值为:{},最小异常值为:{}"
              .format(upper_limit, lower_limit))
        # Keep only the rows strictly inside (lower_limit, upper_limit).
        newData = data[(prices < upper_limit) & (prices > lower_limit)]
        return newData, upper_limit, lower_limit

    def initCenters(self, values, K, Cluster):
        """Pick K starting centers from ``values`` and register them.

        The random generator is re-seeded with 100 on every call, so the
        choice is reproducible.  Centers are drawn with replacement, which
        means two clusters may start from the same value.

        Args:
            values: indexable sequence of prices.
            K: number of clusters.
            Cluster: dict that is filled in place with the new clusters.

        Returns:
            (list of the chosen centers, the same ``Cluster`` dict)

        Raises:
            ValueError: if ``values`` is empty.
        """
        if len(values) == 0:
            raise ValueError("cannot pick cluster centers from empty data")
        random.seed(100)
        oldCenters = list()
        for i in range(K):
            # randrange excludes the upper bound; randint(0, len(values))
            # could return len(values) and index past the end.
            index = random.randrange(len(values))
            Cluster.setdefault(i, {})
            Cluster[i]["center"] = values[index]
            Cluster[i]["values"] = []
            oldCenters.append(values[index])
        return oldCenters, Cluster

    def distance(self, price1, price2):
        """Euclidean distance between two prices (their absolute difference)."""
        return np.emath.sqrt(pow(price1 - price2, 2))

    def kMeans(self, data, K, maxIters):
        """Cluster ``data`` into K groups.

        Iteration stops as soon as the centers stop changing or after
        ``maxIters`` passes over the data, whichever comes first (at least
        one pass is always made).

        Returns:
            The cluster dict; ``values`` holds the assignment of the last pass.
        """
        Cluster = dict()
        oldCenters, Cluster = self.initCenters(data, K, Cluster)
        print("初始的簇类中心为:{}".format(oldCenters))
        i = 0  # index of the current pass
        while True:
            # Assignment step: each price joins the cluster with the nearest
            # center; on a tie the cluster with the smallest id wins.
            for price in data:
                nearest = min(
                    Cluster,
                    key=lambda key: self.distance(price, Cluster[key]["center"]),
                )
                Cluster[nearest]["values"].append(price)
            # Update step: move every center to the mean of its members.
            newCenters = list()
            for cluster in Cluster.values():
                # An empty cluster keeps its previous center; taking the mean
                # of an empty list would yield NaN and break convergence.
                if cluster["values"]:
                    cluster["center"] = np.mean(cluster["values"])
                newCenters.append(cluster["center"])
            print("第{}次迭代后的簇类中心为:{}".format(i, newCenters))
            if oldCenters == newCenters or i + 1 >= maxIters:
                break
            oldCenters = newCenters
            i += 1
            # Forget the assignments before the next pass.
            for cluster in Cluster.values():
                cluster["values"] = []
        return Cluster
if __name__ == "__main__":
    # Load the prices, drop the outliers, then cluster what is left into
    # seven groups and print the resulting cluster dict.
    file = "sku-price/skuid_price.csv"
    km = kMeans()
    data = km.loadData(file)
    newData, upper_limit, lower_limit = km.filterAnomalyValue(data)
    prices = newData["price"].values
    Cluster = km.kMeans(prices, K=7, maxIters=200)
    print(Cluster)
结果
最大异常值为:5149.081853395541,最小异常值为:1
初始的簇类中心为:[362, 58, 48, 1881, 149, 145, 18]
第0次迭代后的簇类中心为:[639.5957446808511, 76.22099447513813, 42.116883116883116, 2633.59649122807, 194.6044776119403, 123.68, 15.355371900826446]
第1次迭代后的簇类中心为:[803.8540372670808, 78.05555555555556, 43.8034188034188, 3236.0897435897436, 259.305, 127.03703703703704, 13.345794392523365]
第2次迭代后的簇类中心为:[889.9057971014493, 80.45161290322581, 44.208333333333336, 3352.5416666666665, 332.25615763546796, 143.6, 13.345794392523365]
第3次迭代后的簇类中心为:[958.2520661157025, 84.01796407185628, 45.03174603174603, 3429.014705882353, 403.6180904522613, 162.56185567010309, 13.345794392523365]
第4次迭代后的簇类中心为:[1007.8967136150235, 91.15300546448087, 46.857142857142854, 3448.179104477612, 472.7724867724868, 187.87745098039215, 13.345794392523365]
第5次迭代后的簇类中心为:[1056.4972677595629, 99.67692307692307, 50.11842105263158, 3448.179104477612, 538.313829787234, 214.69117647058823, 14.368421052631579]
第6次迭代后的簇类中心为:[1097.975, 111.60765550239235, 53.98816568047337, 3448.179104477612, 579.7908163265306, 239.82872928176795, 15.355371900826446]
第7次迭代后的簇类中心为:[1128.7430555555557, 122.0377358490566, 57.79459459459459, 3448.179104477612, 615.2448979591836, 264.6140350877193, 16.34375]
第8次迭代后的簇类中心为:[1158.4, 132.95890410958904, 62.30890052356021, 3448.179104477612, 651.8952879581152, 296.39877300613495, 18.281690140845072]
第9次迭代后的簇类中心为:[1183.1596638655462, 145.78095238095239, 68.66341463414633, 3448.179104477612, 687.5934065934066, 328.2654320987654, 20.411392405063292]
第10次迭代后的簇类中心为:[1223.0833333333333, 160.47524752475246, 74.29596412556054, 3465.4545454545455, 717.1657458563536, 355.64935064935065, 21.810650887573964]
第11次迭代后的簇类中心为:[1254.0309278350514, 175.13197969543148, 80.17094017094017, 3465.4545454545455, 741.934065934066, 382.4166666666667, 23.704918032786885]
第12次迭代后的簇类中心为:[1303.3658536585365, 191.92513368983958, 85.99604743083005, 3465.4545454545455, 766.9424083769634, 405.6212121212121, 24.942708333333332]
第13次迭代后的簇类中心为:[1419.5694444444443, 204.44919786096256, 91.232, 3536.0967741935483, 798.890052356021, 436.9389312977099, 27.36190476190476]
第14次迭代后的簇类中心为:[1595.0892857142858, 218.5632183908046, 97.68339768339769, 3593.6610169491523, 839.5846153846154, 464.0220588235294, 29.254464285714285]
第15次迭代后的簇类中心为:[1781.9791666666667, 233.53529411764706, 104.02334630350195, 3669.4727272727273, 881.2553191489362, 499.0211267605634, 31.818930041152264]
第16次迭代后的簇类中心为:[1977.2093023255813, 244.36746987951807, 109.40873015873017, 3745.9411764705883, 918.4198895027624, 525.2266666666667, 34.011538461538464]
第17次迭代后的簇类中心为:[2163.2105263157896, 257.71069182389937, 116.08835341365462, 3803.1041666666665, 956.7159090909091, 548.0193548387097, 36.33812949640288]
第18次迭代后的簇类中心为:[2366.189189189189, 271.9483870967742, 122.59016393442623, 3907.767441860465, 1000.3636363636364, 576.1779141104295, 38.63513513513514]
第19次迭代后的簇类中心为:[2474.153846153846, 289.85526315789474, 128.77551020408163, 3998.025641025641, 1029.1633986928105, 603.1144578313254, 40.31715210355987]
第20次迭代后的簇类中心为:[2492.975, 310.91333333333336, 135.5483870967742, 4018.315789473684, 1045.041958041958, 628.8282208588957, 41.90654205607477]
第21次迭代后的簇类中心为:[2511.769230769231, 329.50993377483445, 141.08032128514057, 4018.315789473684, 1059.876811594203, 649.2866242038217, 43.259818731117825]
第22次迭代后的簇类中心为:[2531.2105263157896, 348.44666666666666, 147.96774193548387, 4018.315789473684, 1085.984251968504, 675.9177215189874, 45.05813953488372]
第23次迭代后的簇类中心为:[2550.1794871794873, 369.6808510638298, 158.16935483870967, 4038.5135135135133, 1100.7142857142858, 694.5253164556962, 47.38781163434903]
第24次迭代后的簇类中心为:[2605.625, 382.863309352518, 166.69787234042553, 4081.342857142857, 1117.9912280701753, 708.493670886076, 50.269633507853406]
第25次迭代后的簇类中心为:[2639.1428571428573, 399.94573643410854, 176.1949152542373, 4128.121212121212, 1134.066037735849, 722.3703703703703, 52.09620253164557]
第26次迭代后的簇类中心为:[2693.9024390243903, 411.4047619047619, 181.85470085470087, 4152.09375, 1161.0392156862745, 733.7743902439024, 53.43316831683168]
第27次迭代后的簇类中心为:[2693.9024390243903, 424.46031746031747, 188.5286343612335, 4152.09375, 1177.3473684210526, 748.6, 55.39088729016787]
第28次迭代后的簇类中心为:[2712.9, 441.15748031496065, 194.43805309734512, 4152.09375, 1190.1489361702127, 759.6100628930818, 56.6]
第29次迭代后的簇类中心为:[2712.9, 457.5833333333333, 202.08796296296296, 4152.09375, 1207.896551724138, 779.6474358974359, 58.893181818181816]
第30次迭代后的簇类中心为:[2712.9, 474.89230769230767, 209.56880733944953, 4152.09375, 1236.4675324675325, 800.2704402515724, 59.96420581655481]
第31次迭代后的簇类中心为:[2712.9, 490.43283582089555, 214.5852534562212, 4152.09375, 1283.125, 826.2085889570552, 60.93156732891832]
第32次迭代后的簇类中心为:[2712.9, 508.9136690647482, 220.85185185185185, 4152.09375, 1334.0377358490566, 852.6134969325153, 62.073913043478264]
第33次迭代后的簇类中心为:[2712.9, 519.8, 223.66355140186917, 4152.09375, 1387.9772727272727, 873.5121951219512, 62.747844827586206]
第34次迭代后的簇类中心为:[2748.157894736842, 530.6291390728477, 229.0097087378641, 4152.09375, 1452.4878048780488, 890.4906832298136, 64.4324894514768]
第35次迭代后的簇类中心为:[2804.054054054054, 544.6513157894736, 235.3574879227053, 4174.935483870968, 1505.175, 903.8789808917197, 65.27139874739039]
第36次迭代后的簇类中心为:[2822.3611111111113, 554.5562913907285, 242.00975609756097, 4174.935483870968, 1537.4871794871794, 912.3870967741935, 66.46090534979425]
第37次迭代后的簇类中心为:[2822.3611111111113, 566.5833333333334, 246.95544554455446, 4174.935483870968, 1574.5142857142857, 930.0463576158941, 67.51016260162602]
第38次迭代后的簇类中心为:[2822.3611111111113, 576.1552795031056, 251.76142131979697, 4174.935483870968, 1606.125, 945.374149659864, 68.75350701402806]
第39次迭代后的簇类中心为:[2822.3611111111113, 583.6036585365854, 256.8072916666667, 4174.935483870968, 1655.75, 960.4794520547945, 70.00592885375494]
第40次迭代后的簇类中心为:[2822.3611111111113, 595.3913043478261, 262.92424242424244, 4174.935483870968, 1655.75, 965.9859154929577, 70.1854043392505]
第41次迭代后的簇类中心为:[2822.3611111111113, 606.4076433121019, 270.55223880597015, 4174.935483870968, 1655.75, 970.0575539568346, 70.93150684931507]
第42次迭代后的簇类中心为:[2822.3611111111113, 611.9873417721519, 273.68, 4174.935483870968, 1655.75, 974.1764705882352, 71.50389105058366]
第43次迭代后的簇类中心为:[2822.3611111111113, 615.3270440251572, 277.1326530612245, 4174.935483870968, 1655.75, 976.9104477611941, 72.46628131021194]
第44次迭代后的簇类中心为:[2822.3611111111113, 620.8589743589744, 282.1015228426396, 4174.935483870968, 1655.75, 978.2857142857143, 73.0478927203065]
第45次迭代后的簇类中心为:[2822.3611111111113, 624.2193548387097, 287.7083333333333, 4174.935483870968, 1655.75, 979.6590909090909, 74.40831758034027]
第46次迭代后的簇类中心为:[2822.3611111111113, 627.5921052631579, 293.8042328042328, 4174.935483870968, 1655.75, 979.6590909090909, 75.57570093457944]
第47次迭代后的簇类中心为:[2822.3611111111113, 632.125, 298.5483870967742, 4174.935483870968, 1655.75, 982.3846153846154, 76.57037037037037]
第48次迭代后的簇类中心为:[2822.3611111111113, 635.51677852349, 302.3903743315508, 4174.935483870968, 1655.75, 982.3846153846154, 76.97785977859779]
第49次迭代后的簇类中心为:[2822.3611111111113, 638.9527027027027, 307.9344262295082, 4174.935483870968, 1655.75, 983.7364341085271, 78.1970802919708]
第50次迭代后的簇类中心为:[2822.3611111111113, 643.5479452054794, 313.2142857142857, 4174.935483870968, 1655.75, 985.1015625, 79.0126811594203]
第51次迭代后的簇类中心为:[2822.3611111111113, 649.3793103448276, 316.48369565217394, 4174.935483870968, 1655.75, 987.8333333333334, 79.2242314647378]
第52次迭代后的簇类中心为:[2822.3611111111113, 654.0206896551724, 318.9081081081081, 4174.935483870968, 1655.75, 990.5806451612904, 79.43682310469315]
第53次迭代后的簇类中心为:[2822.3611111111113, 658.6689655172414, 321.3279569892473, 4174.935483870968, 1655.75, 993.360655737705, 79.65225225225225]
第54次迭代后的簇类中心为:[2822.3611111111113, 659.8472222222222, 322.8817204301075, 4174.935483870968, 1655.75, 993.360655737705, 79.86870503597122]
第55次迭代后的簇类中心为:[2822.3611111111113, 661.027972027972, 325.7826086956522, 4174.935483870968, 1655.75, 993.360655737705, 80.51878354203936]
第56次迭代后的簇类中心为:[2822.3611111111113, 664.5633802816901, 328.9402173913044, 4174.935483870968, 1655.75, 994.7355371900826, 80.9536541889483]
第57次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 330.5217391304348, 4174.935483870968, 1655.75, 994.7355371900826, 81.17259786476869]
第58次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 331.20765027322403, 4174.935483870968, 1655.75, 994.7355371900826, 81.39253996447601]
第59次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 332.5911602209945, 4174.935483870968, 1655.75, 994.7355371900826, 81.83362831858408]
第60次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 333.2888888888889, 4174.935483870968, 1655.75, 994.7355371900826, 82.0547703180212]
第61次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 333.2888888888889, 4174.935483870968, 1655.75, 994.7355371900826, 82.0547703180212]
{0: {'center': 2822.3611111111113, 'values': [3308, 2988, 2404, 2360, 2667, 2854, 3311, 3070, 3227, 3444, 2817, 3271, 2528, 2884, 3011, 2707, 2563, 2695, 2545, 2369, 2569, 2966, 2308, 2889, 2709, 2867, 3361, 3290, 2372, 3288, 2402, 2823, 3042, 2381, 2406, 2909]}, 1: {'center': 665.7659574468086, 'values': [609, 757, 644, 758, 638, 768, 521, 802, 747, 781, 621, 761, 536, 705, 647, 601, 797, 707, 777, 781, 645, 594, 698, 723, 606, 692, 754, 626, 706, 505, 718, 676, 592, 602, 634, 524, 733, 623, 721, 593, 582, 794, 589, 541, 563, 777, 700, 706, 576, 669, 569, 522, 532, 566, 717, 596, 817, 646, 707, 584, 580, 742, 555, 618, 566, 726, 593, 658, 751, 681, 582, 585, 664, 768, 635, 743, 632, 640, 735, 545, 692, 627, 763, 560, 681, 515, 683, 731, 726, 808, 732, 635, 600, 684, 670, 523, 812, 643, 598, 735, 557, 618, 821, 732, 725, 567, 809, 553, 821, 803, 732, 536, 669, 818, 695, 500, 509, 814, 596, 509, 659, 674, 684, 778, 581, 741, 789, 589, 653, 788, 626, 793, 672, 567, 827, 579, 736, 640, 607, 658, 755]}, 2: {'center': 333.2888888888889, 'values': [385, 405, 233, 285, 247, 491, 439, 387, 237, 244, 245, 219, 482, 461, 448, 324, 432, 426, 378, 400, 495, 419, 259, 467, 311, 351, 292, 427, 213, 473, 229, 317, 217, 216, 338, 489, 306, 445, 314, 430, 352, 259, 268, 329, 273, 454, 465, 327, 339, 284, 211, 493, 310, 224, 273, 249, 213, 414, 344, 326, 349, 279, 395, 237, 252, 412, 362, 241, 267, 465, 385, 427, 449, 414, 468, 327, 420, 444, 267, 268, 294, 480, 257, 245, 225, 427, 444, 228, 253, 269, 336, 231, 288, 210, 359, 228, 475, 255, 427, 274, 229, 292, 301, 458, 295, 290, 453, 333, 369, 227, 386, 474, 265, 379, 398, 297, 241, 215, 209, 247, 483, 423, 299, 413, 327, 417, 378, 352, 273, 322, 247, 306, 271, 243, 385, 369, 343, 276, 377, 298, 473, 378, 427, 241, 485, 314, 232, 377, 331, 325, 395, 273, 493, 414, 475, 260, 444, 244, 297, 453, 415, 458, 281, 245, 319, 470, 244, 246, 235, 221, 270, 290, 222, 243, 288, 223, 227, 272, 208, 208]}, 3: {'center': 4174.935483870968, 
'values': [4374, 4297, 3848, 3672, 4083, 4653, 3702, 3687, 5114, 5014, 3574, 4835, 4295, 4373, 4284, 3891, 3624, 3734, 4985, 3713, 4399, 4173, 4375, 4512, 4841, 3505, 3999, 3740, 4279, 4299, 3549]}, 4: {'center': 1655.75, 'values': [1760, 1433, 1430, 1326, 1528, 1556, 1516, 1564, 1382, 1934, 2044, 1395, 1409, 1514, 1604, 1447, 2042, 1512, 1773, 1421, 1881, 1810, 1874, 1542, 2062, 2145, 2062, 1395]}, 5: {'center': 994.7355371900826, 'values': [838, 1007, 847, 848, 981, 1061, 1041, 1039, 1063, 1039, 969, 1042, 1207, 1073, 1214, 1059, 865, 1078, 1006, 1121, 1017, 1249, 973, 1230, 849, 1124, 1259, 992, 837, 1268, 1254, 1213, 887, 1170, 1233, 1204, 1165, 1220, 1254, 1127, 1186, 1134, 924, 1027, 985, 861, 943, 921, 927, 834, 844, 974, 980, 871, 918, 1038, 838, 878, 993, 908, 952, 902, 964, 937, 848, 1002, 972, 972, 963, 1077, 999, 1017, 1029, 983, 887, 1000, 956, 991, 947, 863, 990, 999, 1015, 887, 898, 840, 995, 846, 1000, 1130, 909, 1014, 938, 1042, 893, 1103, 1037, 1076, 1043, 992, 894, 937, 843, 860, 1039, 1017, 896, 906, 875, 946, 992, 838, 911, 930, 923, 862, 899, 1069, 927, 1077, 907]}, 6: {'center': 82.0547703180212, 'values': [143, 196, 35, 127, 86, 127, 67, 28, 125, 125, 79, 53, 109, 35, 42, 51, 46, 67, 41, 40, 5, 93, 183, 24, 74, 151, 43, 74, 15, 176, 18, 52, 15, 48, 160, 35, 63, 37, 62, 159, 60, 80, 189, 107, 91, 94, 114, 159, 27, 48, 6, 169, 41, 120, 39, 111, 184, 71, 46, 160, 75, 37, 176, 17, 45, 77, 69, 90, 100, 31, 36, 61, 78, 83, 65, 71, 130, 3, 42, 33, 108, 85, 96, 182, 25, 72, 55, 190, 145, 72, 13, 69, 30, 13, 72, 97, 43, 124, 140, 119, 31, 45, 33, 34, 19, 70, 96, 52, 164, 200, 100, 154, 52, 177, 171, 82, 153, 123, 90, 66, 53, 75, 113, 22, 135, 202, 205, 122, 34, 156, 201, 173, 155, 71, 156, 137, 207, 56, 123, 71, 188, 175, 63, 174, 86, 116, 93, 125, 134, 187, 120, 14, 178, 187, 150, 38, 188, 117, 123, 190, 95, 68, 38, 100, 74, 53, 58, 11, 104, 58, 120, 19, 160, 5, 54, 87, 116, 77, 27, 65, 12, 32, 116, 56, 30, 95, 41, 39, 17, 158, 100, 135, 145, 121, 
11, 116, 66, 74, 170, 6, 36, 7, 171, 94, 201, 63, 18, 143, 15, 86, 79, 96, 171, 3, 144, 206, 85, 126, 178, 92, 8, 50, 65, 3, 48, 58, 58, 7, 172, 177, 98, 74, 6, 89, 52, 4, 10, 101, 137, 70, 26, 55, 114, 31, 134, 84, 26, 53, 116, 20, 76, 154, 11, 5, 16, 18, 34, 79, 17, 82, 118, 76, 32, 62, 53, 7, 157, 58, 9, 3, 76, 61, 182, 79, 36, 39, 143, 33, 149, 157, 10, 63, 98, 88, 6, 6, 13, 103, 142, 92, 24, 99, 46, 4, 124, 69, 100, 6, 40, 90, 50, 180, 151, 102, 204, 146, 84, 142, 117, 3, 16, 168, 111, 70, 8, 96, 102, 74, 179, 155, 30, 133, 23, 64, 83, 24, 88, 23, 141, 50, 17, 59, 137, 49, 30, 27, 57, 84, 11, 54, 80, 14, 107, 135, 52, 71, 39, 143, 161, 77, 30, 39, 9, 48, 20, 53, 92, 143, 18, 59, 108, 56, 74, 88, 25, 105, 65, 64, 69, 14, 117, 14, 52, 70, 124, 94, 133, 72, 64, 142, 79, 50, 108, 117, 63, 121, 4, 81, 11, 92, 17, 14, 33, 48, 19, 127, 188, 3, 21, 12, 48, 6, 117, 124, 22, 63, 190, 139, 159, 103, 58, 12, 101, 16, 74, 37, 67, 193, 26, 69, 91, 40, 203, 178, 16, 35, 39, 60, 36, 96, 39, 166, 50, 40, 41, 125, 9, 55, 116, 63, 54, 36, 40, 123, 83, 92, 199, 58, 104, 142, 93, 12, 144, 61, 42, 66, 58, 152, 147, 189, 100, 63, 7, 79, 93, 112, 70, 157, 133, 89, 124, 60, 25, 97, 7, 79, 166, 9, 66, 93, 30, 84, 46, 118, 147, 107, 40, 50, 41, 63, 115, 175, 124, 7, 10, 201, 81, 9, 101, 105, 128, 83, 126, 45, 40, 100, 115, 147, 98, 22, 6, 99, 30, 36, 88, 141, 184, 91, 136, 188, 37, 70, 63, 206, 2, 22, 12, 143, 131, 174, 45, 19, 101, 71, 112, 3, 102, 95, 178, 172, 25, 31, 64, 32, 197, 148, 10, 106, 3, 175, 6, 21, 147, 10, 65, 51, 14, 166, 76, 44, 95, 113, 62, 65, 2, 11]}}
代码下载链接
实现基于二分-Kmeans的商品价格聚类
# -*-coding:utf-8-*-
"""
Author: Thinkgamer
Desc:
代码4-10 实现基于二分-Kmeans的商品价格聚类
"""
import numpy as np
import pandas as pd
import random
class kMeans:
    """One-dimensional k-means and bisecting k-means for product prices.

    Clusters are represented as a dict mapping a cluster id to
    ``{"center": <number>, "values": [<prices assigned to it>]}``; the
    bisecting variant adds an ``"sse"`` entry to each cluster.
    """

    def __init__(self):
        pass

    def loadData(self, file):
        """Read a comma-separated file whose first row is the header."""
        return pd.read_csv(file, header=0, sep=",")

    def filterAnomalyValue(self, data):
        """Drop outlier rows based on the ``price`` column.

        The bounds start from mean +/- 3 standard deviations.  The upper
        bound is never allowed to fall below 5000 and the lower bound is
        never allowed to fall below 1.  Both bounds are exclusive.

        Returns:
            (filtered data, upper bound, lower bound)
        """
        prices = data["price"]
        center = np.mean(prices)
        spread = 3 * np.std(prices)
        upper = center + spread
        lower = center - spread
        upper_limit = upper if upper > 5000 else 5000
        lower_limit = lower if lower > 1 else 1
        print("最大异常值为:{},最小异常值为:{}"
              .format(upper_limit, lower_limit))
        # Keep only the rows strictly inside (lower_limit, upper_limit).
        newData = data[(prices < upper_limit) & (prices > lower_limit)]
        return newData, upper_limit, lower_limit

    def initCenters(self, values, K, Cluster):
        """Pick K starting centers from ``values`` and register them.

        The random generator is re-seeded with 100 on every call, so the
        choice is reproducible.  Centers are drawn with replacement, which
        means two clusters may start from the same value.

        Args:
            values: indexable sequence of prices.
            K: number of clusters.
            Cluster: dict that is filled in place with the new clusters.

        Returns:
            (list of the chosen centers, the same ``Cluster`` dict)

        Raises:
            ValueError: if ``values`` is empty.
        """
        if len(values) == 0:
            raise ValueError("cannot pick cluster centers from empty data")
        random.seed(100)
        oldCenters = list()
        for i in range(K):
            # randrange excludes the upper bound; randint(0, len(values))
            # could return len(values) and index past the end.
            index = random.randrange(len(values))
            Cluster.setdefault(i, {})
            Cluster[i]["center"] = values[index]
            Cluster[i]["values"] = []
            oldCenters.append(values[index])
        return oldCenters, Cluster

    def distance(self, price1, price2):
        """Euclidean distance between two prices (their absolute difference)."""
        return np.emath.sqrt(pow(price1 - price2, 2))

    def kMeans(self, data, K, maxIters):
        """Cluster ``data`` into K groups.

        Iteration stops as soon as the centers stop changing or after
        ``maxIters`` passes over the data, whichever comes first (at least
        one pass is always made).

        Returns:
            The cluster dict; ``values`` holds the assignment of the last pass.
        """
        Cluster = dict()
        oldCenters, Cluster = self.initCenters(data, K, Cluster)
        i = 0  # index of the current pass
        while True:
            # Assignment step: each price joins the cluster with the nearest
            # center; on a tie the cluster with the smallest id wins.
            for price in data:
                nearest = min(
                    Cluster,
                    key=lambda key: self.distance(price, Cluster[key]["center"]),
                )
                Cluster[nearest]["values"].append(price)
            # Update step: move every center to the mean of its members.
            newCenters = list()
            for cluster in Cluster.values():
                # An empty cluster keeps its previous center; taking the mean
                # of an empty list would yield NaN and break convergence.
                if cluster["values"]:
                    cluster["center"] = np.mean(cluster["values"])
                newCenters.append(cluster["center"])
            if oldCenters == newCenters or i + 1 >= maxIters:
                break
            oldCenters = newCenters
            i += 1
            # Forget the assignments before the next pass.
            for cluster in Cluster.values():
                cluster["values"] = []
        return Cluster

    def SSE(self, data, mean):
        """Return the sum of squared differences between ``data`` and ``mean``.

        An empty ``data`` yields 0.0.
        """
        deviations = np.asarray(data, dtype=float).ravel() - mean
        return float(np.dot(deviations, deviations))

    def diKMeans(self, data, K=7):
        """Bisecting k-means: split the worst cluster in two until K exist.

        Starts with a single cluster holding all of ``data`` and repeatedly
        runs 2-means (at most 200 passes) on the cluster with the largest
        SSE.

        Returns:
            dict of cluster id -> {"center", "values", "sse"}.
        """
        clusterSSEResult = dict()
        clusterSSEResult.setdefault(0, {})
        clusterSSEResult[0]["values"] = data
        # Infinite SSE guarantees the initial cluster is split first.
        clusterSSEResult[0]["sse"] = np.inf
        clusterSSEResult[0]["center"] = np.mean(data)
        while len(clusterSSEResult) < K:
            maxSSE = -np.inf
            maxSSEKey = 0
            # Find the cluster with the largest SSE; it is the one to split.
            for key in clusterSSEResult:
                if clusterSSEResult[key]["sse"] > maxSSE:
                    maxSSE = clusterSSEResult[key]["sse"]
                    maxSSEKey = key
            # clusterResult looks like
            # {0: {'center': x, 'values': []}, 1: {'center': x, 'values': []}}
            clusterResult = self.kMeans(
                clusterSSEResult[maxSSEKey]["values"], K=2, maxIters=200
            )
            # Remove the cluster that was just split ...
            del clusterSSEResult[maxSSEKey]
            # ... reuse its id for the first half ...
            first = clusterResult[0]
            clusterSSEResult[maxSSEKey] = {
                "center": first["center"],
                "values": first["values"],
                "sse": self.SSE(first["values"], first["center"]),
            }
            # ... and give the second half the next unused id.
            second = clusterResult[1]
            maxKey = max(clusterSSEResult.keys()) + 1
            clusterSSEResult[maxKey] = {
                "center": second["center"],
                "values": second["values"],
                "sse": self.SSE(second["values"], second["center"]),
            }
        return clusterSSEResult
if __name__ == "__main__":
    # Load the prices, drop the outliers, then run bisecting k-means with
    # seven clusters and print the result.  (Plain k-means is available via
    # km.kMeans on the same price array.)
    file = "../data/sku-price/skuid_price.csv"
    km = kMeans()
    data = km.loadData(file)
    newData, upper_limit, lower_limit = km.filterAnomalyValue(data)
    prices = newData["price"].values
    clusterSSE = km.diKMeans(prices, K=7)
    print(clusterSSE)
结果
最大异常值为:5149.081853395541,最小异常值为:1
{3: {'center': 4152.09375, 'values': [4374, 4297, 3848, 3672, 4083, 4653, 3444, 3702, 3687, 5114, 5014, 3574, 4835, 4295, 4373, 4284, 3891, 3624, 3734, 4985, 3713, 4399, 4173, 4375, 4512, 4841, 3505, 3999, 3740, 4279, 4299, 3549], 'sse': 7261498.71875}, 1: {'center': 1127.6285714285714, 'values': [1007, 981, 1061, 1041, 1039, 1760, 1433, 1430, 1063, 1039, 969, 1326, 1042, 1528, 1207, 1073, 1214, 1059, 1078, 1006, 1556, 1121, 1017, 1249, 973, 1230, 1516, 1564, 1124, 1382, 1259, 992, 1268, 1254, 1213, 1170, 1233, 1395, 1409, 1514, 1604, 1204, 1447, 1512, 1165, 1220, 1773, 1254, 1127, 1421, 1186, 1542, 1134, 1395, 924, 1027, 985, 943, 921, 927, 974, 980, 918, 1038, 993, 952, 964, 937, 1002, 972, 972, 963, 1077, 999, 1017, 1029, 983, 1000, 956, 991, 947, 990, 999, 1015, 995, 1000, 1130, 1014, 938, 1042, 1103, 1037, 1076, 1043, 992, 937, 1039, 1017, 946, 992, 930, 923, 1069, 927, 1077], 'sse': 4282336.514285714}, 4: {'center': 707.6685714285715, 'values': [838, 609, 757, 847, 848, 644, 758, 638, 768, 521, 802, 747, 781, 621, 865, 761, 849, 536, 837, 705, 647, 601, 797, 707, 777, 781, 645, 594, 698, 723, 606, 692, 754, 626, 706, 505, 718, 676, 592, 602, 634, 524, 733, 623, 721, 593, 582, 794, 589, 541, 563, 777, 700, 887, 706, 576, 669, 569, 522, 532, 861, 566, 717, 596, 817, 646, 707, 584, 580, 742, 555, 618, 566, 834, 844, 726, 593, 658, 751, 681, 871, 582, 585, 664, 768, 635, 743, 632, 838, 640, 735, 545, 878, 692, 627, 763, 908, 560, 681, 902, 515, 848, 683, 731, 726, 808, 732, 635, 600, 684, 670, 523, 812, 643, 598, 735, 557, 887, 618, 863, 821, 732, 725, 567, 887, 898, 840, 809, 553, 821, 803, 846, 732, 536, 669, 818, 909, 695, 893, 509, 814, 596, 509, 659, 674, 684, 778, 894, 843, 860, 581, 741, 789, 896, 589, 906, 653, 875, 788, 626, 838, 793, 911, 672, 567, 862, 827, 579, 736, 899, 640, 607, 658, 755, 907], 'sse': 2263906.777142857}, 2: {'center': 336.3314606741573, 'values': [385, 405, 233, 285, 247, 491, 439, 387, 237, 244, 245, 219, 482, 461, 448, 324, 432, 
426, 378, 400, 495, 419, 259, 467, 311, 351, 292, 427, 213, 473, 229, 317, 217, 216, 338, 489, 306, 445, 314, 430, 352, 259, 268, 329, 273, 454, 465, 327, 339, 284, 211, 493, 310, 224, 273, 249, 213, 414, 344, 326, 349, 279, 395, 237, 252, 412, 362, 241, 267, 465, 385, 427, 449, 414, 468, 327, 420, 444, 267, 268, 294, 480, 257, 245, 225, 427, 444, 228, 253, 269, 336, 231, 288, 210, 359, 228, 475, 255, 427, 274, 229, 292, 301, 458, 295, 290, 453, 333, 369, 227, 386, 474, 265, 379, 398, 297, 241, 215, 247, 483, 423, 299, 413, 327, 417, 378, 352, 500, 273, 322, 247, 306, 271, 243, 385, 369, 343, 276, 377, 298, 473, 378, 427, 241, 485, 314, 232, 377, 331, 325, 395, 273, 493, 414, 475, 260, 444, 244, 297, 453, 415, 458, 281, 245, 319, 470, 244, 246, 235, 221, 270, 290, 222, 243, 288, 223, 227, 272], 'sse': 1361889.4438202246}, 5: {'center': 82.72056239015818, 'values': [143, 196, 35, 127, 86, 127, 67, 28, 125, 125, 79, 53, 109, 35, 42, 51, 46, 67, 41, 40, 5, 93, 183, 24, 74, 151, 43, 74, 15, 176, 18, 52, 15, 48, 160, 35, 63, 37, 62, 159, 60, 80, 189, 107, 91, 94, 114, 159, 27, 48, 6, 169, 41, 120, 39, 111, 184, 71, 46, 160, 75, 37, 176, 17, 45, 77, 69, 90, 100, 31, 36, 61, 78, 83, 65, 71, 130, 3, 42, 33, 108, 85, 96, 182, 25, 72, 55, 190, 145, 72, 13, 69, 30, 13, 72, 97, 43, 124, 140, 119, 31, 45, 33, 34, 19, 70, 96, 52, 164, 200, 100, 154, 52, 177, 171, 82, 153, 123, 90, 66, 53, 75, 113, 22, 135, 202, 205, 122, 34, 156, 201, 173, 155, 71, 156, 137, 207, 56, 123, 71, 188, 175, 63, 174, 209, 86, 116, 93, 125, 134, 187, 120, 14, 178, 187, 150, 38, 188, 117, 123, 190, 95, 68, 38, 100, 74, 53, 58, 11, 104, 58, 120, 19, 160, 5, 54, 87, 116, 77, 27, 65, 12, 32, 116, 56, 30, 95, 41, 39, 17, 158, 100, 135, 145, 121, 11, 116, 66, 74, 170, 6, 36, 7, 171, 94, 201, 63, 18, 143, 15, 86, 79, 96, 171, 3, 144, 206, 85, 126, 178, 92, 8, 50, 65, 3, 48, 58, 58, 7, 172, 177, 98, 74, 6, 89, 52, 4, 10, 101, 137, 70, 26, 55, 114, 31, 134, 84, 26, 53, 116, 20, 76, 154, 11, 5, 16, 18, 34, 79, 
17, 82, 118, 76, 32, 62, 53, 7, 157, 58, 9, 3, 76, 61, 182, 79, 36, 39, 143, 33, 149, 157, 10, 63, 98, 88, 6, 6, 13, 103, 142, 92, 24, 99, 46, 4, 124, 69, 100, 6, 40, 90, 50, 180, 151, 102, 204, 146, 84, 142, 117, 3, 16, 168, 111, 70, 8, 96, 102, 74, 179, 155, 30, 133, 23, 64, 83, 24, 88, 23, 141, 50, 17, 59, 137, 49, 30, 27, 57, 84, 11, 54, 80, 14, 107, 135, 52, 71, 39, 143, 161, 77, 30, 39, 9, 48, 20, 53, 92, 143, 18, 59, 108, 56, 74, 88, 25, 105, 65, 64, 69, 14, 117, 14, 52, 70, 124, 94, 133, 72, 64, 142, 79, 50, 108, 117, 63, 121, 4, 81, 11, 92, 17, 14, 33, 48, 19, 127, 188, 3, 21, 12, 48, 6, 117, 124, 22, 63, 190, 139, 159, 103, 58, 12, 101, 16, 74, 37, 67, 193, 26, 69, 91, 40, 203, 178, 16, 35, 39, 60, 36, 96, 39, 166, 50, 40, 41, 125, 9, 55, 116, 63, 54, 36, 40, 123, 83, 92, 199, 58, 104, 142, 93, 12, 144, 61, 42, 66, 58, 152, 147, 189, 100, 63, 7, 79, 93, 112, 70, 157, 133, 89, 124, 60, 25, 97, 7, 79, 166, 9, 66, 93, 30, 84, 46, 118, 147, 107, 40, 50, 41, 63, 115, 175, 124, 7, 10, 201, 81, 9, 101, 105, 128, 83, 126, 45, 40, 100, 115, 147, 98, 22, 6, 99, 30, 36, 88, 141, 184, 91, 136, 188, 208, 37, 70, 63, 206, 2, 22, 12, 143, 131, 174, 45, 19, 101, 71, 112, 3, 102, 95, 178, 172, 25, 31, 64, 32, 197, 148, 208, 10, 106, 3, 175, 6, 21, 147, 10, 65, 51, 14, 166, 76, 44, 95, 113, 62, 65, 2, 11], 'sse': 1723360.569420035}, 0: {'center': 2241.0, 'values': [2404, 2360, 2528, 1934, 2044, 2563, 2545, 2369, 2569, 2042, 2308, 2372, 2402, 1881, 1810, 1874, 2062, 2145, 2381, 2062, 2406], 'sse': 1233570.0}, 6: {'center': 2998.0, 'values': [3308, 2988, 2667, 2854, 3311, 3070, 3227, 2817, 3271, 2884, 3011, 2707, 2695, 2966, 2889, 2709, 2867, 3361, 3290, 3288, 2823, 3042, 2909], 'sse': 1134238.0}}
sk-learn中聚类效果评估
# -*-coding:utf-8-*-
"""
Author: Thinkgamer
Desc:
代码4-11 sk-learn中聚类效果评估
"""
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
# For every score below, a larger value means the predicted labels agree
# more closely with the true labels.  The scores are printed in this order.
scorers = (
    metrics.adjusted_rand_score,  # adjusted Rand index
    metrics.adjusted_mutual_info_score,  # adjusted mutual information
    metrics.homogeneity_score,  # homogeneity
    metrics.completeness_score,  # completeness
    metrics.v_measure_score,  # harmonic mean of homogeneity and completeness
    metrics.fowlkes_mallows_score,  # Fowlkes-Mallows index (FMI)
)
for scorer in scorers:
    print(scorer(labels_true, labels_pred))
结果
0.24242424242424243
0.2987924581708901
0.6666666666666669
0.420619835714305
0.5158037429793889
0.4714045207910317
基于Apriori算法实现频繁项集和关联规则挖掘
# -*-coding:utf-8-*-
"""
Author: Thinkgamer
Desc:
代码4-12 基于Apriori算法实现频繁项集合相关规则挖掘
"""
class Apriori:
    """Frequent-itemset and association-rule mining with the Apriori algorithm."""

    def __init__(self, minSupport, minConfidence, data=None):
        """
        Args:
            minSupport: minimum support an itemset needs to count as frequent.
            minConfidence: minimum confidence a rule needs to be reported.
            data: optional list of transactions (each a list of items);
                when omitted the built-in sample from ``loadData`` is used.
        """
        self.minSupport = minSupport
        self.minConfidence = minConfidence
        self.data = self.loadData() if data is None else data

    def loadData(self):
        """Return the built-in sample transactions."""
        return [[1, 5], [2, 3, 4], [2, 3, 4, 5], [2, 3]]

    def createC1(self, data):
        """Build C1: every distinct item as a one-element frozenset, sorted.

        No occurrence counts are included.  frozenset is used because it is
        immutable and can therefore serve as a dict key.
        """
        distinct = {item for items in data for item in items}
        return [frozenset([item]) for item in sorted(distinct)]

    def scanD(self, Ck):
        """Count the candidates in ``Ck`` and keep the frequent ones.

        Returns:
            (Lk, supportData) where ``Lk`` lists the candidates whose support
            reaches ``self.minSupport`` and ``supportData`` maps every
            candidate that occurs at least once to its support, whether or
            not it is frequent.
        """
        Data = list(map(set, self.data))
        CkCount = {}
        # Count in how many transactions each candidate is fully contained.
        for items in Data:
            for one in Ck:
                if one.issubset(items):
                    CkCount[one] = CkCount.get(one, 0) + 1
        numItems = len(Data)  # number of transactions
        frequent = []
        supportData = {}
        for key, count in CkCount.items():
            support = count * 1.0 / numItems
            if support >= self.minSupport:
                frequent.append(key)
            supportData[key] = support
        # Most recently counted itemsets come first, as callers expect.
        frequent.reverse()
        return frequent, supportData

    def generateNewCk(self, Lk, k):
        """Join itemsets of size k-1 from ``Lk`` into candidates of size k.

        Two itemsets are joined (unioned) when their first k-2 items, taken
        in sorted order, are equal.  Each candidate is returned only once.
        """
        # Sort BEFORE slicing: the iteration order of a frozenset is
        # arbitrary, so slicing first would compare unrelated prefixes.
        prefixes = [sorted(itemset)[: k - 2] for itemset in Lk]
        nextLk = []
        seen = set()
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                if prefixes[i] != prefixes[j]:
                    continue
                candidate = Lk[i] | Lk[j]
                # A repeated candidate would be counted twice by scanD.
                if candidate not in seen:
                    seen.add(candidate)
                    nextLk.append(candidate)
        return nextLk

    def gengrateLK(self):
        """Generate all frequent itemsets.

        Returns:
            (L, supportData) where ``L[n]`` lists the frequent itemsets of
            size n+1 (the final entry is always an empty list) and
            ``supportData`` maps the counted itemsets to their support.
        """
        C1 = self.createC1(self.data)
        L1, supportData = self.scanD(C1)
        L = [L1]
        k = 2
        while len(L[k - 2]) > 0:
            # Combine the frequent (k-1)-itemsets into new candidates Ck.
            Ck = self.generateNewCk(L[k - 2], k)
            Lk, supK = self.scanD(Ck)
            supportData.update(supK)
            L.append(Lk)
            k += 1
        return L, supportData

    def generateRules(self, L, supportData):
        """Derive association rules from every frequent itemset of size >= 2.

        Returns:
            list of (antecedent, consequent, confidence) tuples; every rule
            is also printed as it is found.
        """
        ruleResult = []
        for i in range(1, len(L)):
            for ck in L[i]:
                # Start from single-item consequents.
                Cks = [frozenset([item]) for item in ck]
                self.rulesOfMore(ck, Cks, supportData, ruleResult)
        return ruleResult

    def rulesOfTwo(self, ck, Cks, supportData, ruleResult):
        """Evaluate the rules ``ck - c --> c`` for every consequent c in ``Cks``.

        Rules reaching ``self.minConfidence`` are printed and appended to
        ``ruleResult``.

        Returns:
            the consequents whose rule passed the confidence threshold.
        """
        prunedH = []
        for oneCk in Cks:
            # confidence = support(itemset) / support(antecedent)
            conf = supportData[ck] / supportData[ck - oneCk]
            if conf >= self.minConfidence:
                print(ck - oneCk, "-->", oneCk, "Confidence is:", conf)
                ruleResult.append((ck - oneCk, oneCk, conf))
                prunedH.append(oneCk)
        return prunedH

    def rulesOfMore(self, ck, Cks, supportData, ruleResult):
        """Generate rules for ``ck`` with progressively larger consequents.

        Each round evaluates the current consequents, then merges the
        surviving ones into consequents that are one item larger, until the
        consequent would be as large as ``ck`` or fewer than two survive.
        """
        m = len(Cks[0])
        while len(ck) > m:
            Cks = self.rulesOfTwo(ck, Cks, supportData, ruleResult)
            if len(Cks) > 1:
                Cks = self.generateNewCk(Cks, m + 1)
                m += 1
            else:
                break
if __name__ == "__main__":
    # Mine the built-in sample: print the frequent itemsets per size, the
    # support table, and then the rules (printed by generateRules itself).
    apriori = Apriori(minSupport=0.5, minConfidence=0.6)
    L, supportData = apriori.gengrateLK()
    # enumerate gives each level its own number; L.index() would rescan the
    # list and report the first match if two levels compared equal.
    for size, one in enumerate(L, start=1):
        print("项数为 %s 的频繁项集:" % size, one)
    print("supportData:", supportData)
    print("minConf=0.6时:")
    rules = apriori.generateRules(L, supportData)
结果
项数为 1 的频繁项集: [frozenset({4}), frozenset({3}), frozenset({2}), frozenset({5})]
项数为 2 的频繁项集: [frozenset({2, 3}), frozenset({2, 4}), frozenset({3, 4})]
项数为 3 的频繁项集: [frozenset({2, 3, 4})]
项数为 4 的频繁项集: []
supportData: {frozenset({1}): 0.25, frozenset({5}): 0.5, frozenset({2}): 0.75, frozenset({3}): 0.75, frozenset({4}): 0.5, frozenset({3, 4}): 0.5, frozenset({2, 4}): 0.5, frozenset({2, 3}): 0.75, frozenset({4, 5}): 0.25, frozenset({3, 5}): 0.25, frozenset({2, 5}): 0.25, frozenset({2, 3, 4}): 0.5}
minConf=0.6时:
frozenset({3}) --> frozenset({2}) Confidence is: 1.0
frozenset({2}) --> frozenset({3}) Confidence is: 1.0
frozenset({4}) --> frozenset({2}) Confidence is: 1.0
frozenset({2}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({4}) --> frozenset({3}) Confidence is: 1.0
frozenset({3}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({3, 4}) --> frozenset({2}) Confidence is: 1.0
frozenset({2, 4}) --> frozenset({3}) Confidence is: 1.0
frozenset({2, 3}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({4}) --> frozenset({2, 3}) Confidence is: 1.0
frozenset({3}) --> frozenset({2, 4}) Confidence is: 0.6666666666666666
frozenset({2}) --> frozenset({3, 4}) Confidence is: 0.6666666666666666