def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
  """Adds ops to enqueue on all worker queues.

  Args:
    name_prefix: prefix used for the shared_name of the per-worker queues.
    enqueue_after_list: ops to use as control dependencies for the enqueue ops.

  Returns:
    An op that should be used as a control dependency before starting the
    next step.
  """
  self.sync_queue_counter += 1
  # Round-robin the barrier over the available sync-queue devices.
  with tf.device(self.sync_queue_devices[(
      self.sync_queue_counter % len(self.sync_queue_devices))]):
    sync_queues = [
        tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
                     shared_name='%s%s' % (name_prefix, i))
        for i in range(self.num_workers)]
    # tf.FIFOQueue dequeues elements in first-in first-out order. This
    # creates num_workers queues, one per worker; shared_name makes every
    # worker see the same underlying queues.
    queue_ops = []
    # For each other worker, add an entry in a queue, signaling that it can
    # finish this step.
    token = tf.constant(False)
    with tf.control_dependencies(enqueue_after_list):
      # Nothing below runs until every op in enqueue_after_list has
      # finished, i.e. this worker has already pushed its gradient updates
      # to the parameter servers.
      for i, q in enumerate(sync_queues):
        if i == self.task_index:
          # tf.no_op does nothing: a worker does not signal its own queue.
          queue_ops.append(tf.no_op())
        else:
          # tf.FIFOQueue.enqueue pushes the token (a constant op) into
          # every other worker's queue.
          queue_ops.append(q.enqueue(token))

    # Drain tokens off queue for this worker, one for each other worker.
    queue_ops.append(
        sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))
    # Enqueue first, then dequeue: dequeue_many blocks until this worker's
    # queue holds num_workers - 1 tokens, i.e. until every other worker has
    # checked in. The blocking dequeue of the FIFO queue thus acts as a
    # barrier for synchronous gradient updates in distributed training,
    # though the official recommendation is still
    # tf.train.SyncReplicasOptimizer.
    return tf.group(*queue_ops)
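
For context, here is a minimal sketch of how the returned barrier op might be wired into a worker's training step. This is not the benchmark's actual wiring: `trainer`, `train_op`, `global_step`, and the queue-name prefix are placeholder assumptions, and it uses the same TF 1.x graph-mode API as the function above.

import tensorflow as tf  # TF 1.x graph-mode API, as in the function above

# Placeholders for this sketch: `trainer` is an object exposing
# add_sync_queues_and_barrier as defined above; `train_op` and `global_step`
# come from the surrounding training loop.
main_fetch_group = tf.group(train_op)  # this worker's work for the step
barrier = trainer.add_sync_queues_and_barrier(
    'step_end_barrier_',  # illustrative prefix, not the benchmark's name
    [main_fetch_group])
with tf.control_dependencies([barrier]):
  # The global step only advances once every other worker has enqueued its
  # token, so running inc_global_step each step acts as a cross-worker
  # barrier.
  inc_global_step = global_step.assign_add(1)
# Each worker then calls sess.run(inc_global_step) once per step, and no
# worker can start step N+1 before all workers have finished step N.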