Background: Clustering the categories of businesses in the Yelp academic dataset using the K-Means algorithm.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 6 16:02:17 2018
@author: zizibong
"""
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import collections
import scipy.sparse as sp
import matplotlib.pyplot as plt
def taglist_to_matrix(taglist):
    """Build a sparse document-by-tag count matrix from per-document tag lists.

    Column ids are assigned by descending overall tag frequency (ties keep
    first-seen order), so column 0 holds the most common tag.

    Args:
        taglist: Sequence of tag lists, one per document. For example each
            element is the list of tags of a business:
            ``['Doctors', 'Health & Medical']``.

    Returns:
        A ``scipy.sparse.csc_matrix`` of shape
        ``(len(taglist), number_of_distinct_tags)`` where element ``(i, j)``
        is the number of times tag ``j`` appears in document ``i``.
        Documents without tags yield all-zero rows; an empty ``taglist``
        yields a ``(0, 0)`` matrix.
    """
    counter = collections.Counter(tag for doc in taglist for tag in doc)
    # most_common() orders by descending count, ties in first-seen order.
    word_to_id = {
        tag: tag_id for tag_id, (tag, _) in enumerate(counter.most_common())
    }

    # Coordinate lists: one (row, column) pair per tag occurrence.
    i_indices = []
    j_indices = []
    for doc_idx, doc in enumerate(taglist):
        for tag in doc:
            i_indices.append(doc_idx)
            j_indices.append(word_to_id[tag])
    data = np.ones(len(i_indices), dtype=np.int_)

    # The explicit shape keeps one row per document even when trailing
    # documents have no tags (an inferred shape would stop at the largest
    # index seen) and makes empty input well-defined.
    m = sp.csc_matrix(
        (data, (i_indices, j_indices)),
        shape=(len(taglist), len(word_to_id)),
    )
    # Repeated tags within a document are accumulated into a single count.
    m.sum_duplicates()
    return m
# Load the business table.
# NOTE(review): `yelp_business` is not defined in the visible source; it must
# be bound to the CSV path/buffer before this line runs.
business = pd.read_csv(yelp_business)

# One entry per business, taken from the `categories` column.
# NOTE(review): taglist_to_matrix iterates every entry as a sequence of tags;
# presumably each entry must be a list of tags rather than a raw string --
# confirm how `categories` is stored in the CSV.
tags = business['categories'].tolist()
tag_countmatrix = taglist_to_matrix(tags)

# Fit a 20-cluster K-Means model on the tag counts, then store the cluster
# label predicted for each business in a new `cluster` column.
km = KMeans(n_clusters=20).fit(tag_countmatrix)
business['cluster'] = km.predict(tag_countmatrix)