尝试随机生成测试集和训练集
通常在做数据挖掘或者训练模型时,需要按比例将数据集随机划分为训练集和测试集,这里我自己写了一段划分的代码
还有一份是使用sklearn中一个函数就划分好的(emmmmm,感谢python各种库)
import pandas as pd
import numpy as np
import math
import random
# Manually split the ratings table into a training frame (`ratings_df`) and a
# small test frame (`df`) by moving randomly chosen rows from one to the other.

# Load both CSV files; the context managers close the file handles as soon as
# pandas has finished reading them.
path1=r'data files\ratings.csv'
with open(path1) as csvpath1:
    ratings_df = pd.read_csv(csvpath1)
path2=r'data files\movies.csv'
with open(path2,encoding='UTF-8') as csvpath2:
    movies_df = pd.read_csv(csvpath2)
print(len(ratings_df))

# Restrict the training frame to the four rating columns once, up front,
# instead of rebuilding the frame on every loop iteration.
rating_columns = ['userId','movieId','rating','timestamp']
ratings_df = pd.DataFrame(ratings_df, columns=rating_columns)

# Number of rows to move into the test set. To take a fraction of the data
# instead, use e.g. int(len(ratings_df) * 0.2).
test_rows = 30

row_list = []
# `line` tracks how many rows are still available in the training frame, so
# every remaining row is a candidate on each draw.
line = len(ratings_df)
for i in range(min(test_rows, line)):
    # random.randint includes both endpoints, hence `line - 1`.
    su = random.randint(0, line - 1)
    row_list.append(list(ratings_df.iloc[su, :]))
    # `su` is a position, while drop() works on index labels. Translate the
    # position to its label so that exactly the row just copied is removed,
    # even after earlier removals have left gaps in the index.
    ratings_df.drop([ratings_df.index[su]], axis=0, inplace=True)
    print(len(ratings_df))
    line -= 1

# Test set built from the rows removed above; it gets a fresh 0..n-1 index.
df = pd.DataFrame(row_list, columns=rating_columns)
ratings_df.head()
df.head()
print(len(ratings_df))
# NOTE(review): this discards the first two sampled test rows, which then
# belong to neither set. It looks like a demonstration of drop(); confirm
# that it is intended before reusing this split.
df.drop([0, 1],axis=0,inplace=True)
df
补充:下面是别人使用 sklearn 中的 train_test_split 函数实现划分的方法
from numpy import random
import numpy as np
# Split random demo data into training and test sets with scikit-learn.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

num = 10
# x: `num` samples with 2 features each; y: one target value per sample.
# Values are whole numbers in [0, 9] stored as floats.
x = np.floor(10*np.random.rand(num,2))
y = np.floor(10*np.random.rand(num,1))
# test_size=0.4 puts 40% of the samples in the test set; a fixed random_state
# makes the split reproducible for a given x and y.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)