1. Parse the loss from an event file:

import tensorflow as tf
from tensorflow.python.summary import summary_iterator

event_file = 'events.filename'
for event in summary_iterator.summary_iterator(event_file):
    if event.HasField('summary'):
        for value in event.summary.value:
            # Scalar summaries carry their payload in simple_value.
            if value.HasField('simple_value') and value.tag == 'loss':
                print(value.simple_value)
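The same iterator is also exposed publicly in TF 1.x as tf.train.summary_iterator, which avoids the private-module import; an equivalent sketch:

for event in tf.train.summary_iterator('events.filename'):
    for value in event.summary.value:
        if value.HasField('simple_value') and value.tag == 'loss':
            print(value.simple_value)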
2. Find corrupt tfrecords with multiprocessing:

import tensorflow as tf
import glob
import multiprocessing

def task2(worker_id, q):
    # Drain the shared queue; iterating over a corrupt file raises.
    while not q.empty():
        file = q.get()
        try:
            for _ in tf.python_io.tf_record_iterator(file):
                pass
        except Exception as e:
            print("=====", file)
    return None

pool = multiprocessing.Pool()
m = multiprocessing.Manager()
cpus = multiprocessing.cpu_count()
q = m.Queue()
results = []
train_files = sorted(glob.glob('/path/to/tfrecords_dir/*'))
for each in train_files:
    q.put(each)
for i in range(cpus):
    results.append(pool.apply_async(task2, args=(i, q)))
pool.close()
pool.join()
for result in results:
    result.get()  # re-raises any exception that escaped a worker
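For reference, the same check can be written without the hand-rolled queue by letting Pool.map distribute the files; a sketch reusing the imports and train_files above (check_file is a hypothetical helper):

def check_file(path):
    try:
        for _ in tf.python_io.tf_record_iterator(path):
            pass
        return None  # file iterates cleanly
    except Exception:
        return path  # corrupt file

pool = multiprocessing.Pool()
bad_files = [p for p in pool.map(check_file, train_files) if p]
pool.close()
pool.join()
print(bad_files)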
3. Add a variable to an existing checkpoint:

import os, sys
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
ckpt_path = sys.argv[1]
# Define the new variable before importing the old graph.
tf.get_variable('var_name', dtype=tf.int32, shape=[],
                initializer=tf.constant_initializer(0))
with tf.Session() as sess:
    # At this point only the new variable exists, so the initializer
    # touches nothing else; the old variables are restored below.
    sess.run(tf.global_variables_initializer())
    saver = tf.train.import_meta_graph(ckpt_path + '.meta')
    saver.restore(sess, ckpt_path)
    path = sys.argv[2]  # output directory for the new checkpoint
    saver2 = tf.train.Saver()
    ckpt_name = ckpt_path[ckpt_path.rindex('/') + 1:]
    tf.gfile.MakeDirs(path)
    os.chdir(path)
    print(saver2.save(sess, ckpt_name))
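To confirm the new variable actually made it into the saved checkpoint, the file can be inspected with the checkpoint reader (a quick sketch; ckpt_name is the checkpoint just saved above):

reader = tf.train.NewCheckpointReader(ckpt_name)
# The shape map should now list 'var_name' alongside the old variables.
print(reader.get_variable_to_shape_map())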
4. Query the gradients of parameters (tfdbg's GradientsDebugger):

import tensorflow as tf
from tensorflow.python import debug as tf_debug

x = tf.Variable(1.0, name="x")
y = tf.add(x, x, name="y")
z = tf.square(y, name="z")

# Create the train op under the grad_debugger context so the gradients
# flowing into x and y are recorded by the debugger.
grad_debugger = tf_debug.GradientsDebugger()
with grad_debugger.watch_gradients_by_tensor_names(
        tf.get_default_graph(), r"(x|y):0$"):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(z)

# Now we can reflect through grad_debugger to get the gradient tensors
# with respect to x and y.
x_grad = grad_debugger.gradient_tensor("x:0")
y_grad = grad_debugger.gradient_tensor("y:0")
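The watched gradients are ordinary tensors, so they can be fetched like any other; a minimal sketch:

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # dz/dy = 2*y, and since y = x + x, dz/dx = 2 * dz/dy.
    _, xg, yg = sess.run([train_op, x_grad, y_grad])
    print(xg, yg)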
5. Verbose logging: set the environment variable TF_CPP_MIN_VLOG_LEVEL=1 to turn on the C++ runtime's VLOG output.
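The C++ side reads the flag early, so it is safest to export it before the process starts, or to set it before tensorflow is imported; a minimal sketch:

import os
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '1'  # must precede the tensorflow import
import tensorflow as tf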
6. Defining a local_variable:

tf.get_variable(
    'local_var',
    shape=[],
    dtype=tf.int64,
    initializer=tf.constant_initializer(1),
    trainable=False,
    collections=[tf.GraphKeys.LOCAL_VARIABLES])

Do not omit trainable=False and collections=[tf.GraphKeys.LOCAL_VARIABLES]. Without them the variable lands in the global variables collection, so restoring a checkpoint saved before the variable was added fails with a "variable not found in checkpoint" error; local variables must be skipped during restore, as sketched below.
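A minimal sketch of the matching restore logic, with a hypothetical checkpoint path and a dummy global variable w: the default tf.train.Saver collects global variables only, so the local variable is initialized rather than restored:

import tensorflow as tf

w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
tf.get_variable(
    'local_var', shape=[], dtype=tf.int64,
    initializer=tf.constant_initializer(1),
    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])

# The default Saver excludes LOCAL_VARIABLES, so the old checkpoint
# (which only knows about w) restores cleanly.
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, '/path/to/old_ckpt')  # hypothetical path
    sess.run(tf.local_variables_initializer())  # local vars: init, not restore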