In [26]:
import pandas as pd
import os
# The three MovieLens .dat files are read as headerless, '::'-delimited text
# decoded as latin1; column names are supplied explicitly below.
encoding = 'latin1'

# expanduser() only matters if a path is later changed to start with '~';
# for the relative paths used here it returns the string unchanged.
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# A multi-character separator is not supported by the C parser. Requesting the
# Python engine explicitly avoids the ParserWarning pandas emits when it has
# to fall back on its own.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
In [6]:
# Preview the first five user records.
users.head()
Out[6]:
In [7]:
# Preview the first five rating records.
ratings.head()
Out[7]:
In [8]:
# Preview the first five movie records.
movies.head()
Out[8]:
In [9]:
# Display the ratings frame itself (its notebook repr rather than a fixed-size preview).
ratings
Out[9]:
In [10]:
# Combine the three tables into one frame. No join keys are passed, so pandas
# joins on the column names the frames share: 'user_id' for ratings/users,
# then 'movie_id' for the result and movies.
data = ratings.merge(users).merge(movies)
data
Out[10]:
In [11]:
# Inspect the first merged record. `.ix` is deprecated and no longer exists in
# current pandas; `.iloc[0]` selects the first row by position, which is the
# same row here because merge() returns a fresh 0..n-1 index.
data.iloc[0]
Out[11]:
In [34]:
import sys

# Overriding the interpreter's default encoding is a Python 2-only workaround:
# `reload` is not a builtin and `sys.setdefaultencoding` does not exist on
# Python 3, where text is already unicode. Guard it so the cell runs on both.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- makes sys.setdefaultencoding reachable again on Python 2
    sys.setdefaultencoding('latin1')

# Mean rating for every title, with one column per gender value.
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')
In [38]:
# Preview the first five rows of the per-title mean ratings.
mean_ratings.head()
Out[38]:
In [39]:
# Count how many rating rows each title has by grouping the merged data on 'title'.
ratings_by_title = (
    data
    .groupby(by='title')
    .size()
)
In [40]:
# Preview the rating counts for the first five titles.
ratings_by_title.head()
Out[40]:
In [41]:
# Keep only the titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
In [42]:
# Preview the first ten titles that passed the rating-count filter.
active_titles[:10]
Out[42]:
In [43]:
# Restrict the mean ratings to the frequently rated titles. `.ix` is deprecated
# and absent from current pandas; `.loc` performs the same label-based lookup
# on the title index.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
Out[43]:
In [44]:
# Replace the long Seven Samurai index label with a shorter one.
mean_ratings = mean_ratings.rename(
    index={
        'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
            'Seven Samurai (Shichinin no samurai) (1954)',
    }
)
In [45]:
# Rank titles by the mean rating in the 'F' column, highest first, to surface
# the films female viewers rated best. `sort_index(by=...)` has been removed
# from pandas; `sort_values` is the supported way to sort by a column.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings.head(10)
Out[45]: