sklearn.utils.shuffle 用法小技巧

本文介绍了一种使用Python中的sklearn库实现多个数组统一打乱的方法,通过示例展示了如何对不同数组进行同步随机排序,确保每个数组中元素对应位置的一致性,这对于机器学习数据预处理尤其有用。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

是可以打乱多组数据的,不局限于 x、y。

import numpy as np
import random
from sklearn.utils import shuffle

# Demo: sklearn.utils.shuffle permutes any number of arrays in unison --
# the SAME permutation is applied to a, b and c, so elements at matching
# positions stay aligned (useful for features / labels / sample weights).
# NOTE(review): `random` is imported but unused in this snippet.
a = np.array([1, 2, 3, 4, 5])
b = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
c = np.array([-1, -2, -3, -4, -5])

# Pass random_state so the demo is reproducible; without it each run
# produces a different permutation and the sample output below could
# never be matched.
a, b, c = shuffle(a, b, c, random_state=0)
print(a)
print(b)
print(c)

# example output (one possible permutation -- note it is identical
# across all three arrays):
# [1 4 2 5 3]
# [0.1 0.4 0.2 0.5 0.3]
# [-1 -4 -2 -5 -3]

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Load the dataset; features/labels start at the third column.
data = pd.read_csv('pfyh.csv')
df = pd.DataFrame(data)
dataset = df.iloc[:, 2:].to_numpy()
df.head()  # return value discarded -- only useful in a notebook

# Quick visual sanity check of one raw column.
plt.plot(df.iloc[:, 2])
plt.title("Data Visualization")
plt.show()

# Split into features (all but last column) and label (last column).
X = np.array(dataset[:, :-1])
y = np.array(dataset[:, -1])

# Standardize the features; shrink the label scale.
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = y / 1000

# Chronological train/test split (90% train, 10% test).
split_index = int(len(X) * 0.9)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]


class TimeSeriesDataset(Dataset):
    """Sliding-window dataset: sample i is (x[i:i+L], y[i+L])."""

    def __init__(self, x, y, sequence_length):
        self.x = x
        self.y = y
        self.sequence_length = sequence_length

    def __len__(self):
        # one window per start index that still has a target after it
        return len(self.x) - self.sequence_length

    def __getitem__(self, idx):
        return (
            torch.tensor(self.x[idx:idx + self.sequence_length], dtype=torch.float),
            torch.tensor(self.y[idx + self.sequence_length], dtype=torch.float)
        )


# Datasets and loaders; only the training loader shuffles.
sequence_length = 14
train_dataset = TimeSeriesDataset(X_train, y_train, sequence_length)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = TimeSeriesDataset(X_test, y_test, sequence_length)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


class LSTMModel(nn.Module):
    """LSTM regressor: last time step's hidden state -> linear head."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.init_weights()

    def forward(self, x):
        out, _ = self.lstm(x)
        # use only the final time step's output
        out = self.fc(out[:, -1, :])
        return out

    def init_weights(self):
        # NOTE(review): the source snippet appears truncated here -- only
        # the RNG seed is set; no actual weight initialization follows in
        # the visible code.
        torch.manual_seed(42)
03-31
import json
import os
import numpy as np
import joblib
from sklearn import metrics
from sklearn.datasets import make_circles, load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import torch
import torch.nn as nn
import torch.nn.functional as F


class CustomLinear(nn.Module):
    """Hand-rolled fully-connected layer: y = x @ w + b."""

    def __init__(self, in_features, out_features):
        super(CustomLinear, self).__init__()
        self.w = nn.Parameter(torch.empty(in_features, out_features))
        self.b = nn.Parameter(torch.empty(out_features))
        # parameter initialization
        nn.init.uniform_(self.w)
        self.b.data.fill_(0.02)

    def forward(self, x):
        # return x @ self.w + self.b
        return torch.matmul(x, self.w) + self.b


class NetworkV0(nn.Module):
    """3 hidden FC layers + 1 output layer, mixing raw parameters,
    a custom module and a stock nn.Linear."""

    def __init__(self, in_features, num_classes):
        """
        Module initialization only: creates the parameters and sub-modules
        owned by this model; the execution graph is NOT built here.
        :param in_features: size of each sample's feature vector
        :param num_classes: number of output classes
        """
        super(NetworkV0, self).__init__()
        # alpha is a plain tensor, deliberately NOT a registered parameter
        self.alpha = torch.randn(2)
        # first hidden layer (raw parameters)
        self.w1 = nn.Parameter(torch.empty(in_features, 8))
        self.b1 = nn.Parameter(torch.empty(8))
        # second hidden layer
        self.fc2 = CustomLinear(8, 16)
        # third hidden layer
        self.fc3 = nn.Linear(16, 32)
        self.relu3 = nn.ReLU()
        # output layer
        self.classifier = nn.Linear(32, num_classes)
        # parameter initialization
        nn.init.uniform_(self.w1)
        nn.init.uniform_(self.b1)
        # self.parameters(): yields every parameter, including sub-modules'
        # for param in self.parameters():
        #     pass

    def forward(self, x):
        """
        Forward execution graph of this model.
        ps: forward may take several arguments -- it is an ordinary method.
        ps: the whole net is fully-connected + ReLU: 3 hidden layers + 1
        output layer --> 4 linear layers + 3 activations in total.
        :param x: [bs, in_features] batch of bs samples, each with
            in_features features
        :return: score -- confidence of every class for each sample,
            [bs, num_classes]
        """
        # hidden layer 1: [bs,in_features] * [in_features,8] + [8] -> [bs,8]
        z1 = F.relu(x @ self.w1 + self.b1)
        # hidden layer 2: [bs,8] * [8,16] + [16] -> [bs,16]
        z2 = F.relu(self.fc2(z1))
        # hidden layer 3: [bs,16] * [16,32] + [32] -> [bs,32]
        z3 = self.relu3(self.fc3(z2))
        # output layer: per-class confidence scores
        return self.classifier(z3)


class Network(nn.Module):
    """Same architecture as NetworkV0, expressed with nn.Sequential."""

    def __init__(self, in_features, num_classes):
        """
        Module initialization only: creates the sub-modules owned by this
        model; the execution graph is NOT built here.
        :param in_features: size of each sample's feature vector
        :param num_classes: number of output classes
        """
        super(Network, self).__init__()
        # feature-extraction stack
        self.features = nn.Sequential(
            CustomLinear(in_features, 8),
            nn.ReLU(),
            CustomLinear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU()
        )
        # output layer
        self.classifier = nn.Linear(32, num_classes)

    def forward(self, x):
        """
        Forward execution graph: 3 FC+ReLU hidden layers, then a linear
        output head.
        :param x: [bs, in_features]
        :return: score [bs, num_classes]
        """
        # feature extraction: [bs,in_features] -> [bs,32]
        x = self.features(x)
        # output layer: per-class confidence scores
        return self.classifier(x)


def tt_network():
    # smoke-test: inspect parameters and push a random batch through
    net = Network(in_features=4, num_classes=2)
    print(net)
    print("=" * 100)
    for name, param in net.named_parameters():
        print(f"参数名称: {name} 对应参数shape: {param.shape} 参数里面的第一个值: {param.view(-1)[0]}")
        if name == 'b1':
            print(param)
    x = torch.randn(5, 4)
    r = net(x)
    print(r.shape)


# noinspection DuplicatedCode
def training():
    # 1. data loading
    x, y = make_circles(n_samples=1000, noise=0.2, factor=0.01, random_state=1)
    # x, y = load_iris(return_X_y=True)
    print(f"特征属性shape: {x.shape}")
    print(f"目标属性shape: {y.shape}")
    y = ['M' if _y == 0 else 'F' for _y in y]
    y = np.asarray(y)
    print(f"目标属性y的取值: {np.unique(y)}")
    in_features = x.shape[1]
    classes_number = len(np.unique(y))
    # 2. data processing
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)
    # 3. model training
    # 3.1 creation --> build the network structure, optimizer, loss function
    # model init ----> constructs the modules of the execution graph
    net = Network(in_features=in_features, num_classes=classes_number)
    # loss-function construction
    # optimizer construction
    # 3.2 training --> iterate the data, run forward and backward passes
    # 3.2.1 forward pass ----> builds the execution graph (run order of the
    #       model) and obtains the loss
    # 3.2.2 backward pass ----> handled by the framework:
    #       gradient computation + parameter update + zeroing gradients
    # NOTE: training is a loop, so model evaluation and persistence happen
    # inside it as well
    # 4. model evaluation
    # 4.1 iterate the data, run inference, score the predictions
    # 5. persist the model to disk
    pass


if __name__ == '__main__':
    tt_network()
最新发布
07-15
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值