转：『Sklearn』数据划分方法及python代码

本文详细介绍了多种交叉验证方法，包括K折交叉验证、留一法、随机划分法等，并提供了每种方法的代码实现示例。

原理介绍

K折交叉验证：

KFold，GroupKFold，StratifiedKFold

留一法：

LeaveOneGroupOut，LeavePGroupsOut，LeaveOneOut，LeavePOut

随机划分法：

ShuffleSplit，GroupShuffleSplit，StratifiedShuffleSplit

代码实现

流程：

实例化划分器（交叉验证迭代器） -> 调用 .split() 迭代得到每一组的训练集/测试集索引

KFold(n_splits=2)

# KFold example: split the samples into n_splits folds and iterate over
# the (train indices, test indices) pair produced for each fold.
import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

kf = KFold(n_splits=2)  # number of folds to split the data into
# kf.get_n_splits(X) can be called to query the number of folds.
print(kf)

for train_index, test_index in kf.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    # Index the arrays with the returned positions to get the actual subsets.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

GroupKFold(n_splits=2)

# GroupKFold example. (The original author noted not fully understanding
# this splitter.) Per the scikit-learn documentation, GroupKFold is a K-fold
# variant in which the same group never appears in both the training and the
# test set of a split. Here every sample is its own group.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])
groups = np.array([1, 2, 3, 4, 5, 6])  # group label of each sample

group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)  # returns the number of folds
print(group_kfold)

for train_index, test_index in group_kfold.split(X, y, groups):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# GroupKFold(n_splits=2)
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [1 3 5] ,Test Index: [0 2 4]

StratifiedKFold(n_splits=3)

# StratifiedKFold example: K-fold that tries, as far as possible, to keep
# the proportion of each class the same across the splits.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])  # two classes, three samples each

skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X, y)  # returns the number of folds
print(skf)

for train_index, test_index in skf.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

# Output (the original listing showed only the first two of the three folds;
# the third is added here for completeness):
# StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
# Train Index: [1 2 4 5] ,Test Index: [0 3]
# Train Index: [0 2 3 5] ,Test Index: [1 4]
# Train Index: [0 1 3 4] ,Test Index: [2 5]

LeaveOneOut()

# LeaveOneOut example: each split keeps exactly one sample as the test set
# and trains on all the others.
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

loo = LeaveOneOut()
loo.get_n_splits(X)  # returns the number of splits
print(loo)

for train_index, test_index in loo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# LeaveOneOut()
# Train Index: [1 2 3 4 5] ,Test Index: [0]
# Train Index: [0 2 3 4 5] ,Test Index: [1]
# Train Index: [0 1 3 4 5] ,Test Index: [2]
# Train Index: [0 1 2 4 5] ,Test Index: [3]
# Train Index: [0 1 2 3 5] ,Test Index: [4]
# Train Index: [0 1 2 3 4] ,Test Index: [5]

LeavePOut(p=3)

# LeavePOut example: each split keeps p samples as the test set, enumerating
# every combination of p samples (20 splits for 6 samples with p=3).
import numpy as np
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

lpo = LeavePOut(p=3)
lpo.get_n_splits(X)  # returns the number of splits
print(lpo)

for train_index, test_index in lpo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# LeavePOut(p=3)
# Train Index: [3 4 5] ,Test Index: [0 1 2]
# Train Index: [2 4 5] ,Test Index: [0 1 3]
# Train Index: [2 3 5] ,Test Index: [0 1 4]
# Train Index: [2 3 4] ,Test Index: [0 1 5]
# Train Index: [1 4 5] ,Test Index: [0 2 3]
# Train Index: [1 3 5] ,Test Index: [0 2 4]
# Train Index: [1 3 4] ,Test Index: [0 2 5]
# Train Index: [1 2 5] ,Test Index: [0 3 4]
# Train Index: [1 2 4] ,Test Index: [0 3 5]
# Train Index: [1 2 3] ,Test Index: [0 4 5]
# Train Index: [0 4 5] ,Test Index: [1 2 3]
# Train Index: [0 3 5] ,Test Index: [1 2 4]
# Train Index: [0 3 4] ,Test Index: [1 2 5]
# Train Index: [0 2 5] ,Test Index: [1 3 4]
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [0 2 3] ,Test Index: [1 4 5]
# Train Index: [0 1 5] ,Test Index: [2 3 4]
# Train Index: [0 1 4] ,Test Index: [2 3 5]
# Train Index: [0 1 3] ,Test Index: [2 4 5]
# Train Index: [0 1 2] ,Test Index: [3 4 5]

ShuffleSplit(n_splits=3,test_size=.25,random_state=0)

# ShuffleSplit example: shuffle the samples, then split them into a training
# and a test set. Which samples land in each set is random (reproducible via
# random_state); the set sizes are controlled by train_size / test_size, and
# their sum may be less than 1, in which case some samples are left unused.
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

# Only test_size given: the training set is the complement of the test set.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)  # returns the number of splits
print(rs)

for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

print("==============================")

# train_size + test_size < 1: one sample per split is used by neither set.
rs = ShuffleSplit(n_splits=3, train_size=.5, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)

for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)

# Output recorded by the original author:
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
# Train Index: [1 3 0 4] ,Test Index: [5 2]
# Train Index: [4 0 2 5] ,Test Index: [1 3]
# Train Index: [1 2 4 0] ,Test Index: [3 5]
# ==============================
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
# Train Index: [1 3 0] ,Test Index: [5 2]
# Train Index: [4 0 2] ,Test Index: [1 3]
# Train Index: [1 2 4] ,Test Index: [3 5]

StratifiedShuffleSplit(n_splits=3,test_size=.5,random_state=0)

# StratifiedShuffleSplit example: like ShuffleSplit, the samples are shuffled
# and split into a training and a test set whose sizes are controlled by
# train_size / test_size (their sum may be less than 1), but the split also
# tries to keep the proportion of each class the same in both sets.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 2])  # two classes, three samples each

sss = StratifiedShuffleSplit(n_splits=3, test_size=.5, random_state=0)
sss.get_n_splits(X, y)  # returns the number of splits
print(sss)

for train_index, test_index in sss.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Uncomment to inspect the subsets:
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5, train_size=None)
# Train Index: [5 4 1] ,Test Index: [3 2 0]
# Train Index: [5 2 3] ,Test Index: [0 4 1]
# Train Index: [5 0 4] ,Test Index: [3 1 2]