Notes on moving TensorFlow from single-machine multi-GPU to distributed training

This post walks through a TensorFlow distributed training example that parallelizes work across multiple GPUs and multiple nodes by configuring devices and a cluster spec. The program defines the data input, model construction, loss computation, and optimization, and wires them into a training loop.


import argparse
import sys
import time

import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim

import model
import icdar

tf.app.flags.DEFINE_integer('input_size', 512, '')
tf.app.flags.DEFINE_integer('batch_size_per_gpu', 14, '')
tf.app.flags.DEFINE_integer('num_readers', 2, '')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, '')
tf.app.flags.DEFINE_integer('max_steps', 100000, '')
tf.app.flags.DEFINE_float('moving_average_decay', 0.997, '')
tf.app.flags.DEFINE_integer('num_gpus', 1, '')
tf.app.flags.DEFINE_string('checkpoint_path', ' ', '')
tf.app.flags.DEFINE_boolean('restore', False, 'whether to restore from checkpoint')
tf.app.flags.DEFINE_integer('save_checkpoint_steps', 1000, '')
tf.app.flags.DEFINE_integer('save_summary_steps', 100, '')
tf.app.flags.DEFINE_string('pretrained_model_path', None, '')
# Needed by the RBOX/QUAD branch below; without it FLAGS.geometry is undefined.
tf.app.flags.DEFINE_string('geometry', 'RBOX', 'which geometry to generate, RBOX or QUAD')

tf.app.flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
                  'One of "ps", "worker", "controller", "".  Empty for local '
                  'training')
tf.app.flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of task within the job')

FLAGS = tf.app.flags.FLAGS

def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
    # Body omitted in this post: builds the detection network on `images`
    # (reusing variables across towers) and returns the regularized total loss
    # together with the raw model loss. A possible implementation is sketched
    # after the code listing.
    return total_loss, model_loss

def average_gradients(tower_grads):
    # Body omitted in this post: the usual multi-tower routine that averages
    # per-variable gradients across towers. Note that it is never called below,
    # because the tower losses are summed instead of averaging gradients.
    return average_grads

def main(argv=None):
  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")

  # Create a cluster from the parameter server and worker hosts.
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

  # Create and start a server for the local task.
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
    server.join()

  elif FLAGS.job_name == "worker":
#    worker_device = "/job:worker/replica:0/task:%d/gpu:0" % FLAGS.task_index
    # Assigns ops to the local worker by default.

    with tf.device(tf.train.replica_device_setter(
        ps_device = "/job:ps/cpu:0",
        worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
        cluster=cluster)): 
      global_step = tf.Variable(0, name="global_step", trainable=False)
      # Build model...
 
      input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
      input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps')
      if FLAGS.geometry == 'RBOX':
          input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 5], name='input_geo_maps')
      else:
          input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 8], name='input_geo_maps')
      input_training_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_training_masks')

      # Split the input batch evenly across the local GPUs.
      gpus = range(FLAGS.num_gpus)
      input_images_split = tf.split(input_images, len(gpus))
      input_score_maps_split = tf.split(input_score_maps, len(gpus))
      input_geo_maps_split = tf.split(input_geo_maps, len(gpus))
      input_training_masks_split = tf.split(input_training_masks, len(gpus))
 
      num_workers = len(worker_hosts)
 
      opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
      opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=num_workers,
        total_num_replicas=num_workers,
        name="sync_replicas")

      # Note: instead of collecting per-tower (gradient, variable) pairs and
      # calling average_gradients(), this version simply sums the tower losses
      # and lets the optimizer minimize the sum.
      tower_grads = 0
      reuse_variables = None
      for gpu_id in gpus:
          with tf.device("/job:worker/replica:0/task:%d/gpu:%d" % (FLAGS.task_index, gpu_id)):
              # Build one model tower per local GPU, sharing variables after the first tower.
              with tf.name_scope('model_%d' % gpu_id) as scope:
                  i = gpu_id
                  tf.logging.info('building tower %d' % gpu_id)
                  iis = input_images_split[i]
                  isms = input_score_maps_split[i]
                  igms = input_geo_maps_split[i]
                  itms = input_training_masks_split[i]
                  total_loss, model_loss = tower_loss(iis, isms, igms, itms, reuse_variables)
                  batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
                  reuse_variables = True
                  tower_grads += total_loss

      apply_gradient_op = opt.minimize(tower_grads, global_step=global_step)

    # The StopAtStepHook handles stopping after running given steps.
      with tf.control_dependencies([apply_gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='no')

      sync_replicas_hook = opt.make_session_run_hook((FLAGS.task_index == 0))

      # Note: last_step=10 stops the session very early; for real training this
      # should be tied to FLAGS.max_steps.
      hook = tf.train.StopAtStepHook(last_step=10)
      hooks = [hook,
               sync_replicas_hook,
               tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': total_loss}, every_n_iter=1)]
      # The MonitoredTrainingSession takes care of session initialization,
      # restoring from a checkpoint, saving to a checkpoint, and closing when done
      # or an error occurs.
      # Note: this Saver is created but never handed to the session below;
      # MonitoredTrainingSession builds its own saver via its Scaffold.
      saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True)

      config = tf.ConfigProto()
      config.allow_soft_placement = True
 
      # checkpoint_dir is left as a placeholder here; point it at a real
      # directory so the chief can save and restore checkpoints.
      with tf.train.MonitoredTrainingSession(master=server.target,
                                             is_chief=(FLAGS.task_index == 0),
                                             checkpoint_dir=" ",
                                             hooks=hooks,
                                             config=config) as mon_sess:

        data_generator = icdar.get_batch(num_workers=FLAGS.num_readers,
                                           input_size=FLAGS.input_size,
                                           batch_size=FLAGS.batch_size_per_gpu * len(gpus))

        while not mon_sess.should_stop():
          # Run a training step asynchronously.
          # See tf.train.SyncReplicasOptimizer for additional details on how to
          # perform *synchronous* training.
          # mon_sess.run handles AbortedError in case of a preempted PS.
          start = time.time()
          for step in range(FLAGS.max_steps):
              print(step)
              data = next(data_generator)
              mon_sess.run([train_op], feed_dict={input_images: data[0],
                                                  input_score_maps: data[2],
                                                  input_geo_maps: data[3],
                                                  input_training_masks: data[4]})

if __name__ == "__main__":

  tf.app.run()
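For reference, here is roughly what the two bodies elided above (`tower_loss` and `average_gradients`) look like in the single-machine multi-GPU EAST-style script this is derived from. This is only a sketch under the assumption that the imported `model` module exposes `model.model()` and `model.loss()` as in the public EAST implementation; it is not necessarily the exact code that was cut.

```python
def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
    # Build the network on this tower, reusing variables after the first tower.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)
    # Task loss plus any regularization losses registered on the graph.
    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry, training_masks)
    total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    return total_loss, model_loss


def average_gradients(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per tower, as returned
    # by optimizer.compute_gradients() on each tower.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # Variables are shared between towers, so the first tower's variable suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads
```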

 

Notes and open issues:

1. In

with tf.device(tf.train.replica_device_setter(
        ps_device = "/job:ps/cpu:0",
        worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
        cluster=cluster)):

pay attention to how worker_device is set. A minimal illustration of what the device setter does follows below.
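For context, `replica_device_setter` pins variables to the ps job and everything else to the `worker_device` you pass in. A minimal sketch (the variable `w` and the op are made up purely for illustration):

```python
with tf.device(tf.train.replica_device_setter(
        ps_device="/job:ps/cpu:0",
        worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
        cluster=cluster)):
    w = tf.get_variable("w", shape=[3, 3])   # variable -> placed on the ps job
    y = tf.matmul(tf.ones([1, 3]), w)        # op       -> placed on this worker task
```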

2. Nowhere in the program are the variables explicitly initialized, whereas both the official benchmarks and the distributed MNIST example do this (the benchmarks handle it quite thoroughly). This is something that still needs to be improved here; one possible approach is sketched below.
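One way to make initialization explicit (rather than relying on whatever `MonitoredTrainingSession` builds by default) is to pass in a `Scaffold`; this is a minimal sketch, not taken from the benchmarks:

```python
# Explicit init ops: global variables on the chief, local variables/tables everywhere.
scaffold = tf.train.Scaffold(
    init_op=tf.global_variables_initializer(),
    local_init_op=tf.group(tf.local_variables_initializer(),
                           tf.tables_initializer()))

with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=(FLAGS.task_index == 0),
                                       scaffold=scaffold,
                                       hooks=hooks) as mon_sess:
    ...
```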

3. The way global_step is created and wired up also needs attention; see the sketch below.
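For the global step, TensorFlow's own helper avoids creating a second, unrelated counter and is what the official distributed examples use:

```python
# Reuses the canonical global_step if it already exists, otherwise creates it,
# so hooks, SyncReplicasOptimizer and summaries all see the same counter.
global_step = tf.train.get_or_create_global_step()
```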

4. tf.summary (and related tooling) should be used much more, for example as sketched below.
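A minimal way to wire summaries into the hooks already used above; the output directory is a made-up placeholder:

```python
# Record the losses and let a hook write event files periodically.
tf.summary.scalar('model_loss', model_loss)
tf.summary.scalar('total_loss', total_loss)
summary_hook = tf.train.SummarySaverHook(
    save_steps=FLAGS.save_summary_steps,
    output_dir='/tmp/east_summaries',   # hypothetical path, adjust as needed
    summary_op=tf.summary.merge_all())
hooks.append(summary_hook)
```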

Update:

Configure tf.logging first (set its verbosity) before calling tf.logging.info() and friends; see below.
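Concretely, that just means something like:

```python
# Set verbosity once, near the top of main(), before any tf.logging.info() call;
# otherwise INFO-level messages are silently dropped.
tf.logging.set_verbosity(tf.logging.INFO)
```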

If any experts happen to come across this, please don't hesitate to point out mistakes!

### Single-machine distributed deployment

Running distributed training on a single machine is an important capability of deep learning frameworks such as PyTorch and TensorFlow. The two frameworks implement it as follows.

#### Single-machine distributed training in PyTorch

PyTorch provides a concise interface for managing device computation and supports single-machine distributed training via `torch.nn.DataParallel` or `torch.nn.parallel.DistributedDataParallel`[^1]. A typical workflow:

- Initialize the process group through the `torch.distributed` module and choose a backend (e.g. `NCCL`) to speed up GPU-to-GPU communication.
- Wrap the model in `DistributedDataParallel` so the forward and backward passes run in parallel on each GPU.
- Configure the data loading so that every GPU receives a different shard of the data.

Example code:

```python
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Initialize the distributed environment
def setup(rank, world_size):
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# Tear down the distributed environment
def cleanup():
    dist.destroy_process_group()

# Training logic
def train(rank, world_size):
    setup(rank, world_size)
    torch.cuda.set_device(rank)

    # Build the model on this process's GPU and wrap it in DDP
    model = torch.nn.Linear(10, 5).cuda(rank)
    ddp_model = DDP(model, device_ids=[rank])

    # Optimizer and loss
    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(10):
        outputs = ddp_model(torch.randn(20, 10).cuda(rank))
        labels = torch.randn(20, 5).cuda(rank)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    cleanup()

# Launch one process per GPU
torch.multiprocessing.spawn(train, args=(4,), nprocs=4, join=True)
```

#### Single-machine distributed training in TensorFlow

TensorFlow offers the higher-level `tf.distribute` module to simplify distributed training[^2]. Its core strategies include `MirroredStrategy` and `MultiWorkerMirroredStrategy`, with `MirroredStrategy` targeting the single-machine case. The approach:

- Create a distribution strategy object with `MirroredStrategy`.
- Build the model, optimizer, and data pipeline inside the strategy scope so that all operations conform to the distributed setup.
- The strategy picks an efficient communication implementation (e.g. NCCL or a ring all-reduce) to keep GPU synchronization fast.

Example code:

```python
import tensorflow as tf

GLOBAL_BATCH_SIZE = 16

# Define the distribution strategy (one replica per visible GPU)
strategy = tf.distribute.MirroredStrategy()

# Build the model, optimizer and loss inside the strategy scope
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(10,))])
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    # Per-example losses; averaging over the *global* batch is done manually below
    loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)

# Prepare the data and let the strategy shard each batch across replicas
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([100, 10]), tf.random.normal([100, 5]))).batch(GLOBAL_BATCH_SIZE)
dist_dataset = strategy.experimental_distribute_dataset(dataset)

# One distributed training step
@tf.function
def distributed_train_step(data, labels):
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs)
            per_example_loss = loss_fn(targets, predictions)
            loss = tf.nn.compute_average_loss(per_example_loss,
                                              global_batch_size=GLOBAL_BATCH_SIZE)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    per_replica_losses = strategy.run(train_step, args=(data, labels))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

for epoch in range(10):
    for data, labels in dist_dataset:
        loss = distributed_train_step(data, labels)
```

#### Framework comparison and choice

- **PyTorch** suits workloads that need dynamic computation graphs and offers a flexible interface, but the user has to manage more of the details by hand[^3].
- **TensorFlow**'s `tf.distribute` module provides a higher level of abstraction and lowers development complexity, which makes it a good fit for beginners or anyone who wants to get distributed training running quickly[^2].