1、自编函数简单随机抽取
#!/usr/bin/python
# 加载包
import numpy as np
import pandas as pd
def split_train_test(data, test_ratio, seed=1234):
    """Randomly split ``data`` into a training set and a test set.

    The row positions are shuffled with a generator seeded by ``seed``, so
    repeated calls on the same data return the same split.

    :param data: object supporting ``len()`` and positional ``.iloc`` indexing
        (e.g. a pandas DataFrame)
    :param test_ratio: fraction of the rows to put in the test set
    :param seed: seed used for the shuffle
    :return: tuple ``(train, test)``
    """
    # A private RandomState yields the same permutation as seeding the global
    # generator, but does not reset numpy's global random state for the caller.
    rng = np.random.RandomState(seed)
    index = rng.permutation(len(data))  # shuffled row positions
    t_size = int(len(data) * test_ratio)
    test_index = index[:t_size]
    train_index = index[t_size:]
    # .iloc because the permutation holds integer positions, not index labels
    return data.iloc[train_index], data.iloc[test_index]
固定随机种子后,该方法可以保证每次生成的训练集和测试集相同。
但当数据集更新(例如新增或删除样本)后,同一个种子得到的划分会发生变化,该方法随之失效。
更稳妥的做法是根据每条样本的唯一 ID 来划分:这样数据更新后,原有样本的划分结果与之前保持一致,只是额外加入了新增的数据。
2、ID的哈希值抽取
#!/usr/bin/python
import hashlib
def test_set_check(identifier, test_ratio, hash = hashlib.md5):
"""
对哈希值加密后用二进制表示,提取最后一个字节
对最后一个字节按照 256(一个字节大小) * test_ratio 划分成两类
:param identifier: 需要处理成hashlib.md5 的ID编码
:param test_ratio: 提取测试集的占比
:return: 布尔值
"""
return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio # 加密后的结果用二进制表示
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """
    Split ``data`` into train and test sets using a hash of each row's ID.

    :param data: table supporting ``data[id_column]`` and boolean ``.loc``
        selection (e.g. a pandas DataFrame)
    :param test_ratio: fraction of samples to put in the test set
    :param id_column: name of the column holding the IDs
    :param hash: hash constructor forwarded to ``test_set_check``
    :return: tuple ``(train, test)``
    """
    ids = data[id_column]
    # Boolean mask: True for rows whose ID hashes into the test set
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
3、sklearn函数随机抽取
# Load the train/test splitting function from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Random split with 20% of the data as the test set; random_state is fixed at 42
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 42)
4、sklearn函数分层抽取
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified split (20% test), stratified on the 'income_cat' column
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data['income_cat']):
    # split() yields integer positions, so select by position with .iloc;
    # label-based .loc is only equivalent when the index is the default 0..n-1.
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]
分层抽样得到的各类别比例与原数据集几乎一致,优于简单随机抽样。
两种抽样方式的类别比例比较如下:
# Category proportions in the full data set
a = data['income_cat'].value_counts()/len(data)
# Category proportions after stratified sampling (train / test)
ftr = strat_train_set['income_cat'].value_counts()/len(strat_train_set)
fte = strat_test_set['income_cat'].value_counts()/len(strat_test_set)
# Category proportions after purely random sampling (train / test)
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 42)
sjtr = train_set['income_cat'].value_counts()/len(train_set)
sjte = test_set['income_cat'].value_counts()/len(test_set)
# Standard deviation of each sample's deviation from the full-data proportions.
# The last term uses the random test set (sjte); it previously repeated sjtr.
np.std(ftr - a), np.std(fte - a), np.std(sjtr - a), np.std(sjte - a)
某一数据集上的比较结果如下:
(2.1667325363377233e-05, 8.666930145347169e-05, 0.001571964081795336, 0.001571964081795336)
注:该输出的第三、四个值完全相同,是因为生成该输出时第四项同样取自随机抽样的训练集(sjtr),并非随机抽样测试集(sjte)的结果。
###显然分层抽样效果更佳