# coding=utf-8
__author__='changgy'
__date__ ='20160205'
import sys
import math
import numpy as np
import pandas as pd
import datetime
'''
u.data holds the 100k rating records; its columns are:
user_id | item_id | rating | timestamp
u.user holds the user information; its columns are:
user_id | age | gender | occupation | zip code
u.item holds the movie information; its columns are:
item_id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation
| Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance
| Sci-Fi | Thriller | War | Western |
'''
# NOTE: a `global` statement at module scope has no effect -- these lines do
# not create the names. They only come into existence once some code assigns
# to them (readData declares the same three names global).
# Presumably they hold the DataFrames for u.data / u.user / u.item -- confirm.
global g_DF_UDATA
global g_DF_USER
global g_DF_MOVIE_ITEM
def readSrcData(fileDir, sep, columns, chunkSize=100000):
    """Read a headerless delimited text file into a single DataFrame.

    The file is parsed in chunks of ``chunkSize`` rows; the chunks are then
    concatenated and given a fresh 0..n-1 index.

    Args:
        fileDir: Source handed straight to ``pd.read_table``.
        sep: Field delimiter.
        columns: Labels assigned to the resulting columns; their count must
            match the number of fields parsed from the file.
        chunkSize: Number of rows parsed per chunk (default 100000).

    Returns:
        A DataFrame with every row of the source, labelled with ``columns``.
    """
    # ``sep`` has to be passed by keyword: pandas >= 2.0 only accepts the
    # source positionally and raises TypeError for anything else.
    reader = pd.read_table(fileDir, sep=sep, header=None, chunksize=chunkSize)
    # Iterating the reader yields one DataFrame per chunk until exhausted.
    chunks = [chunk for chunk in reader]
    df = pd.concat(chunks, ignore_index=True)
    df.columns = columns
    return df
class CcalcCostTime(object):
    """Timer that prints the elapsed time when the object is finalized.

    The start time is captured on construction. When ``__del__`` runs, one
    line of the form ``cost is  <seconds> s  <milliseconds> ms`` is printed.
    """

    def __init__(self):
        # Start of the measured interval.
        self.d1 = datetime.datetime.now()

    def __del__(self):
        elapsed = datetime.datetime.now() - self.d1
        # timedelta.seconds only covers the part below one day, so fold the
        # days in to report the true number of whole seconds.
        whole_seconds = elapsed.days * 86400 + elapsed.seconds
        # Floor division keeps the millisecond part an integer on Python 3.
        millis = elapsed.microseconds // 1000
        # Single-argument print(...) emits the same text on Python 2 and 3;
        # the doubled spaces reproduce the original comma-separated output.
        print('cost is  %d s  %d ms' % (whole_seconds, millis))
def getUserDataFramebyGender(user_all, gender):
    """Return the rows of one column whose value is greater than zero.

    Args:
        user_all: DataFrame containing a column labelled ``gender``.
        gender: Label of the column to filter on and to return.

    Returns:
        A single-column DataFrame (column ``gender``) holding only the rows
        where that column is > 0, with the original index labels kept.
    """
    # Filter just the needed column instead of materialising every column of
    # the filtered frame and then discarding all but one.
    column = user_all[gender]
    return column[column > 0].to_frame()
def readData():
global g_DF_UDATA
global g_DF_USER
global g_DF_MOVIE_ITEM
# 读取udata文件
__author__='changgy'
__date__ ='20160205'
import sys
import math
import numpy as np
import pandas as pd
import datetime
'''
u.data holds the 100k rating records; its columns are:
user_id | item_id | rating | timestamp
u.user holds the user information; its columns are:
user_id | age | gender | occupation | zip code
u.item holds the movie information; its columns are:
item_id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation
| Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance
| Sci-Fi | Thriller | War | Western |
'''
# NOTE: a `global` statement at module scope has no effect -- these lines do
# not create the names. They only come into existence once some code assigns
# to them (readData declares the same three names global).
# Presumably they hold the DataFrames for u.data / u.user / u.item -- confirm.
global g_DF_UDATA
global g_DF_USER
global g_DF_MOVIE_ITEM
def readSrcData(fileDir, sep, columns, chunkSize=100000):
    """Read a headerless delimited text file into a single DataFrame.

    The file is parsed in chunks of ``chunkSize`` rows; the chunks are then
    concatenated and given a fresh 0..n-1 index.

    Args:
        fileDir: Source handed straight to ``pd.read_table``.
        sep: Field delimiter.
        columns: Labels assigned to the resulting columns; their count must
            match the number of fields parsed from the file.
        chunkSize: Number of rows parsed per chunk (default 100000).

    Returns:
        A DataFrame with every row of the source, labelled with ``columns``.
    """
    # ``sep`` has to be passed by keyword: pandas >= 2.0 only accepts the
    # source positionally and raises TypeError for anything else.
    reader = pd.read_table(fileDir, sep=sep, header=None, chunksize=chunkSize)
    # Iterating the reader yields one DataFrame per chunk until exhausted.
    chunks = [chunk for chunk in reader]
    df = pd.concat(chunks, ignore_index=True)
    df.columns = columns
    return df
class CcalcCostTime(object):
    """Timer that prints the elapsed time when the object is finalized.

    The start time is captured on construction. When ``__del__`` runs, one
    line of the form ``cost is  <seconds> s  <milliseconds> ms`` is printed.
    """

    def __init__(self):
        # Start of the measured interval.
        self.d1 = datetime.datetime.now()

    def __del__(self):
        elapsed = datetime.datetime.now() - self.d1
        # timedelta.seconds only covers the part below one day, so fold the
        # days in to report the true number of whole seconds.
        whole_seconds = elapsed.days * 86400 + elapsed.seconds
        # Floor division keeps the millisecond part an integer on Python 3.
        millis = elapsed.microseconds // 1000
        # Single-argument print(...) emits the same text on Python 2 and 3;
        # the doubled spaces reproduce the original comma-separated output.
        print('cost is  %d s  %d ms' % (whole_seconds, millis))
def getUserDataFramebyGender(user_all, gender):
    """Return the rows of one column whose value is greater than zero.

    Args:
        user_all: DataFrame containing a column labelled ``gender``.
        gender: Label of the column to filter on and to return.

    Returns:
        A single-column DataFrame (column ``gender``) holding only the rows
        where that column is > 0, with the original index labels kept.
    """
    # Filter just the needed column instead of materialising every column of
    # the filtered frame and then discarding all but one.
    column = user_all[gender]
    return column[column > 0].to_frame()
def readData():
global g_DF_UDATA
global g_DF_USER
global g_DF_MOVIE_ITEM
# 读取udata文件