K-means代码实战

Background: Cluster the businesses in the Yelp academic dataset by their category tags using the K-Means algorithm.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  6 16:02:17 2018

@author: zizibong
"""

import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import collections
import scipy.sparse as sp
import matplotlib.pyplot as plt
def taglist_to_matrix(taglist):
    """Build a sparse document-by-tag count matrix from per-document tag lists.

    Args:
        taglist: Sequence of tag lists, one per document. For example, each
            element may be the tags of one business:
            ``['Doctors', 'Health & Medical']``.

    Returns:
        A ``scipy.sparse.csc_matrix`` with exactly ``len(taglist)`` rows and
        one column per distinct tag, where element ``(i, j)`` counts how many
        times tag ``j`` appears in document ``i``. Column ids are assigned in
        order of decreasing overall tag frequency (ties keep first-seen
        order). Documents without tags produce all-zero rows, and an input
        with no tags at all produces a matrix with zero columns.
    """
    tag_counts = collections.Counter(tag for doc in taglist for tag in doc)
    # most_common() is a stable sort by decreasing count, so the most frequent
    # tag gets column 0 and equal counts keep their first-seen order.
    word_to_id = {
        tag: col for col, (tag, _) in enumerate(tag_counts.most_common())
    }

    # One (row, col) coordinate per tag occurrence.
    row_indices = []
    col_indices = []
    for doc_idx, doc in enumerate(taglist):
        for tag in doc:
            row_indices.append(doc_idx)
            col_indices.append(word_to_id[tag])

    # Integer ones, matching the dtype numpy infers from a list of Python ints,
    # and still well-typed when there are no occurrences at all.
    data = np.ones(len(row_indices), dtype=int)

    # The shape must be explicit: if it were inferred from the largest index,
    # trailing documents with no tags would be silently dropped and the rows
    # would stop lining up with the input documents.
    m = sp.csc_matrix(
        (data, (row_indices, col_indices)),
        shape=(len(taglist), len(word_to_id)),
    )
    # Repeated tags within a document are merged into a single count.
    m.sum_duplicates()
    return m
# Load the business table and cluster its rows by category tags.
# NOTE(review): `yelp_business` is not defined anywhere in the visible source;
# it must be bound to the CSV path (or buffer) before this line runs.
business = pd.read_csv(yelp_business)

# One entry per business row, taken from the `categories` column.
# NOTE(review): taglist_to_matrix iterates each entry as a collection of tags;
# if the CSV stores categories as plain strings they would be iterated
# character by character -- confirm the column format and split if needed.
tags = business["categories"].tolist()
tag_countmatrix = taglist_to_matrix(tags)

# Fit K-Means with 20 clusters on the tag-count matrix, then store each
# row's cluster label back on the DataFrame.
km = KMeans(n_clusters=20).fit(tag_countmatrix)
business["cluster"] = km.predict(tag_countmatrix)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值