Usually a data set does not come already divided into a training set and a test set, so to guard against overfitting we have to split it ourselves.
I went ahead and wrote a Python function that splits the data at any ratio, drawing each training sample at random without repetition (a more compact numpy-based alternative is sketched after the function).
"""
divide the original data set into training set and test set
percent -- percentage of training set
"""
import numpy as np
import random as rd
def divideTrainAndTest():
    # python 2.x needs raw_input() instead of input()
    filename = input("Enter the file name:")
    o_data = np.loadtxt(filename)
    o_rows, o_columns = o_data.shape

    # ask for the training-set percentage
    percent = float(input("Enter the percentage of training data (0~1):"))
    tr_rows = int(percent * o_rows)
    ts_rows = o_rows - tr_rows

    # allocate the output arrays with the same dtype as the loaded data
    tr_data = np.zeros([tr_rows, o_columns], dtype=o_data.dtype)
    ts_data = np.zeros([ts_rows, o_columns], dtype=o_data.dtype)

    # boolean list recording whether a sample has been put into the training set
    isInTrain = [False] * o_rows

    # draw training samples at random, without repetition
    i = 0
    while i < tr_rows:
        j = rd.randint(0, o_rows - 1)
        if not isInTrain[j]:
            isInTrain[j] = True
            tr_data[i] = o_data[j]   # copy the whole row
            i += 1

    # the remaining samples form the test set
    j = 0
    for i in range(o_rows):
        if not isInTrain[i]:
            ts_data[j] = o_data[i]
            j += 1

    trainname = input("Enter the training data name:")
    testname = input("Enter the test data name:")
    np.savetxt(trainname, tr_data, fmt="%f")
    np.savetxt(testname, ts_data, fmt="%f")
divideTrainAndTest()
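
For comparison, the same no-repetition split can be done more compactly by shuffling the row indices with numpy and slicing. This is only a minimal sketch, not part of the function above; the function name split_by_permutation and the file names data.txt, train.txt, test.txt are placeholders for illustration.

import numpy as np

def split_by_permutation(data, percent):
    """Shuffle the row indices once, then slice; no sample is repeated by construction."""
    idx = np.random.permutation(data.shape[0])
    tr_rows = int(percent * data.shape[0])
    return data[idx[:tr_rows]], data[idx[tr_rows:]]

# example usage with placeholder file names
o_data = np.loadtxt("data.txt")
tr_data, ts_data = split_by_permutation(o_data, 0.7)
np.savetxt("train.txt", tr_data, fmt="%f")
np.savetxt("test.txt", ts_data, fmt="%f")

Because every index appears exactly once in the permutation, this avoids the repeated rejection sampling of the randint loop and runs in a single pass over the data.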