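How do we calculate similarity?
Use cosine similarity: the number of apps two lists share, divided by the square root of the product of the two list lengths.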
Edit helper.py.
The similarity function lives in the Helper class, so you can call it wherever it is needed:
>>> helper = Helper()
>>> similarity = helper.cosine_similarity(app_list1, app_list2)
import operator
import math
class Helper(object):
    """Similarity helpers for comparing lists of app ids."""

    @classmethod
    def cosine_similarity(cls, app_list1, app_list2):
        """Return the cosine similarity of two app lists.

        The score is the number of elements of ``app_list1`` that also occur
        in ``app_list2``, divided by
        ``sqrt(len(app_list1) * len(app_list2))``.

        Args:
            app_list1: Sized iterable of app ids.
            app_list2: Sized iterable of app ids.

        Returns:
            The similarity as a float. ``0.0`` is returned when either list
            is empty, because nothing can match and the denominator would
            otherwise be zero.
        """
        length_product = len(app_list1) * len(app_list2)
        if length_product == 0:
            return 0.0
        match_count = cls.__count_match(app_list1, app_list2)
        return float(match_count) / math.sqrt(length_product)

    @classmethod
    def __count_match(cls, list1, list2):
        """Count the elements of ``list1`` that are also present in ``list2``.

        Duplicates in ``list1`` are counted once per occurrence.
        """
        try:
            # Constant-time membership tests instead of rescanning list2
            # for every element of list1.
            lookup = set(list2)
        except TypeError:
            # Unhashable elements: fall back to a linear scan of list2.
            lookup = list2
        return sum(1 for element in list1 if element in lookup)
def calculate_top_5(app, user_download_history):
    """Print up to five apps most similar to ``app``.

    Every download history contributes its cosine similarity to ``[app]`` to
    each app it contains; the per-app totals are then ranked from highest to
    lowest.

    Args:
        app: Id of the app to find recommendations for.
        user_download_history: Iterable of download histories, each one a
            list of app ids.

    Returns:
        None. The recommendation is printed. Nothing is printed when ``app``
        does not appear in any download history.
    """
    # Accumulated similarity of every app seen so far: {app_id: similarity}
    app_similarity = {}
    for apps in user_download_history:
        if not apps:
            # An empty history has no apps to score and would make the
            # similarity denominator zero.
            continue
        similarity = Helper.cosine_similarity([app], apps)
        for other_app in apps:
            app_similarity[other_app] = app_similarity.get(other_app, 0) + similarity

    # `app` is only a key when at least one history contains it; otherwise
    # there are no related apps to recommend.
    if app not in app_similarity:
        return

    # Never recommend the app to itself.
    app_similarity.pop(app)
    # Sort by accumulated similarity, highest first.
    sorted_tups = sorted(app_similarity.items(), key=operator.itemgetter(1), reverse=True)
    # Slicing tolerates fewer than five candidates, unlike fixed indexing.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    print("top_5_app for " + str(app) + ":\t" + str(top_5_app))
Edit dataservice.py.
This module retrieves and stores the data.
from pymongo import MongoClient
import random
# about data
class DataService(object):
    """Class-level access point to the ``appstore`` database collections."""

    @classmethod
    def init(cls, client):
        """Bind the service to ``client`` and cache its collection handles.

        Args:
            client: Database client exposing an ``appstore`` database with
                ``user_download_history`` and ``app_info`` collections.
        """
        cls.client = client
        cls.db = client.appstore
        cls.user_download_history = cls.db.user_download_history
        cls.app_info = cls.db.app_info

    @classmethod
    def retrieve_user_download_history(cls, filter_dict=None):
        """Return user download histories as ``{user_id: download_history}``.

        Args:
            filter_dict: Query filter passed to the collection's ``find``.
                When omitted, an empty filter is used so every document in
                the collection is returned.

        Returns:
            A dict mapping each document's ``user_id`` to its
            ``download_history``. If several documents share a ``user_id``,
            the last one returned by the cursor wins.
        """
        # A None sentinel avoids sharing one mutable default dict across calls.
        if filter_dict is None:
            filter_dict = {}
        cursor = cls.user_download_history.find(filter_dict)
        return {
            document['user_id']: document['download_history']
            for document in cursor
        }
Edit main.py
from pymongo import MongoClient
from dataservice import DataService
from helper import calculate_top_5
def main():
    """Connect to the local MongoDB and print recommendations for one app.

    Any exception raised along the way is printed rather than propagated,
    and the client is closed afterwards if it was created.
    """
    # Stays None if the connection attempt itself raises.
    client = None
    try:
        # Hand a MongoDB client to DataService.
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # Work flow: load every user's history, then rank related apps.
        histories = DataService.retrieve_user_download_history()
        calculate_top_5('C10107104', histories.values())
    except Exception as error:
        print(error)
    finally:
        # Clean-up: only close a client that was actually created.
        if client is not None:
            client.close()


if __name__ == "__main__":
    main()
output:
top_5_app for C10107104: [u'C10129690', u'C5341', u'C20252', u'C10191382', u'C183901']