[tensorflow2.0]08.wide_and_deep

This post builds a California housing-price prediction model with both the Keras functional API and the subclassing API, covering data preprocessing, model construction, training, and evaluation. It uses the California Housing dataset and demonstrates the model's learning behavior through feature standardization, training, and plotting of the learning curves.
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import sklearn
import os
import sys
import time
import tensorflow as tf
import pprint

from tensorflow import keras

print('TensorFlow Version:{}'.format(tf.__version__))
# print('Is gpu available:{}'.format(tf.test.is_gpu_available()))
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)


from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7, test_size=0.25)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=7, test_size=0.25)

scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_valid_scaler = scaler.transform(x_valid)
x_test_scaler = scaler.transform(x_test)
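
# Optional sanity check: after StandardScaler, each training feature should have
# roughly zero mean and unit variance (validation/test only approximately, since
# they reuse the training statistics).
print('train feature means:', x_train_scaler.mean(axis=0).round(2))
print('train feature stds: ', x_train_scaler.std(axis=0).round(2))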

'''
# Functional API version (kept for reference; `input_` avoids shadowing the builtin)
input_  = keras.layers.Input(shape=x_train_scaler.shape[1:])
hidden1 = keras.layers.Dense(30, activation='relu')(input_)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
# Wide & Deep: concatenate the raw input (wide path) with the deep path output
concat  = keras.layers.concatenate([input_, hidden2])
output  = keras.layers.Dense(1)(concat)

# Fix the layer graph into a Model
model = keras.models.Model(inputs=[input_], outputs=[output])
'''

# Subclassing API
class WideDeepModel(keras.models.Model):
    def __init__(self):
        super(WideDeepModel, self).__init__()
        '''Define the model layers.'''
        self.hidden1_layer = keras.layers.Dense(30, activation='relu')
        self.hidden2_layer = keras.layers.Dense(30, activation='relu')
        self.output_layer  = keras.layers.Dense(1)

    def call(self, inputs, training=None, mask=None):
        '''Forward pass: deep path through two hidden layers, then concatenate with the raw (wide) input.'''
        hidden1 = self.hidden1_layer(inputs)
        hidden2 = self.hidden2_layer(hidden1)
        concat  = keras.layers.concatenate([inputs, hidden2])
        return self.output_layer(concat)

model = WideDeepModel()
model.build(input_shape=(None, 8))
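
# With the subclassed model built for 8 input features (the California Housing
# dataset has 8 numeric features), model.summary() shows the layer parameter counts.
model.summary()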

model.compile(optimizer='adam',
              loss=keras.losses.mean_absolute_error)

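# EarlyStopping monitors val_loss by default: training stops once the validation
# loss fails to improve by more than min_delta over `patience` consecutive epochs.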
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3)]

history = model.fit(x_train_scaler, y_train,
                    epochs=100,
                    validation_data=(x_valid_scaler, y_valid),
                    callbacks=callbacks)

def plot_learning_curves(history):
    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.gca().set_ylim(0,1)
    plt.show()

plot_learning_curves(history)

print('model.evaluate==================')
model.evaluate(x_test_scaler, y_test)
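
# A minimal prediction sketch: feed a few standardized test rows through the
# trained model; outputs are median house values in units of $100,000.
print('sample predictions:', model.predict(x_test_scaler[:3]).ravel())
print('ground truth      :', y_test[:3])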
