K-means代码实战

最新推荐文章于 2025-05-29 21:28:37 发布

原创最新推荐文章于 2025-05-29 21:28:37 发布 · 340 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#K-means #Yelp

机器学习专栏收录该内容

10 篇文章

订阅专栏

Background: clustering businesses in the Yelp academic dataset by their category tags, using the K-Means algorithm.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  6 16:02:17 2018

@author: zizibong
"""

import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import collections
import scipy.sparse as sp
import matplotlib.pyplot as plt
def taglist_to_matrix(taglist):
    """Build a sparse document-by-tag count matrix.

    Args:
        taglist: Sequence of documents, each a re-iterable collection of
            hashable tags, e.g. the tags of one business:
            ``['Doctors', 'Health & Medical']``.

    Returns:
        A ``scipy.sparse.csc_matrix`` of shape
        ``(len(taglist), number_of_distinct_tags)`` where element ``(i, j)``
        is how many times tag ``j`` appears in document ``i``. Columns are
        ordered by descending overall tag frequency; tags with equal
        frequency keep the order in which they were first seen.
    """
    counter = collections.Counter(tag for doc in taglist for tag in doc)
    # most_common() sorts by descending count with a stable sort, so ties
    # stay in first-seen order (same column order as sorting on -count).
    word_to_id = {
        word: col for col, (word, _) in enumerate(counter.most_common())
    }

    # Coordinate-format indices: one (row, col) pair per tag occurrence.
    i_indices = []
    j_indices = []
    for doc_idx, doc in enumerate(taglist):
        for tag in doc:
            i_indices.append(doc_idx)
            j_indices.append(word_to_id[tag])
    data = [1] * len(i_indices)

    # The shape is passed explicitly: when it is inferred from the largest
    # index, trailing documents without tags lose their rows, and input
    # with no tags at all cannot be converted. dtype is fixed so the empty
    # case yields integer counts like the non-empty case.
    m = sp.csc_matrix(
        (data, (i_indices, j_indices)),
        shape=(len(taglist), len(word_to_id)),
        dtype=int,
    )
    # Repeated (row, col) pairs are merged so each cell holds the count.
    m.sum_duplicates()
    return m
# Load the business table.
# NOTE(review): `yelp_business` (the CSV path) is not defined in the visible
# code; it must be bound before this line runs.
business = pd.read_csv(yelp_business)

# One entry per business, taken from the `categories` column.
# NOTE(review): read_csv normally yields each cell as a raw string (or NaN),
# whereas taglist_to_matrix is documented to take a list of tags per
# business -- confirm whether the cells need to be parsed/split first.
tags = business.categories.tolist()
tag_countmatrix = taglist_to_matrix(tags)

# Fit a 20-cluster K-Means model on the tag counts (fit() returns the
# estimator itself), then label every business with its assigned cluster.
km = KMeans(n_clusters=20).fit(tag_countmatrix)
business['cluster'] = km.predict(tag_countmatrix)