UCF-101 3D-convolution input example

The scripts below form a C3D-style action-recognition pipeline on UCF-101 with TensorFlow 1.x: UCF-101 videos are converted into fixed-length 16-frame clips stored as TFRecords, a 3D-convolutional classifier is trained with tf.estimator, and inference is run frame-by-frame on a video.
- create_tfrecord_dataset.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import cv2;
import numpy as np;
from random import shuffle;
import os;
import tensorflow as tf;
import pickle;
def video2sample(ucf_rootdir):
    if not os.path.isdir(ucf_rootdir):
        print("invalid UCF root directory!");
        exit(1);
    dirs = os.listdir(ucf_rootdir);
classname = dict();
samplelist = list();
#collect all samples
for i,dirname in enumerate(dirs):
        assert os.path.isdir(os.path.join(ucf_rootdir,dirname));
        classname[i] = dirname;
        videos = os.listdir(os.path.join(ucf_rootdir,dirname));
#for every video
for j in range(0,len(videos)):
vidname = videos[j];
name,ext = os.path.splitext(vidname);
if ext == '.avi' or ext == '.AVI':
vp = os.path.join(ucf_rootdir,dirname,vidname);
samplelist.append((vp,i));
#output id->classname map to file
with open('id2classname.dat','wb') as f:
f.write(pickle.dumps(classname));
    #shuffle samples and split 90% train / 10% test
    shuffle(samplelist);
    trainset_size = 9 * len(samplelist) // 10;
#write all train samples to tfrecord
    if os.path.exists('trainset.tfrecord'):
        os.remove('trainset.tfrecord');
writer = tf.python_io.TFRecordWriter('trainset.tfrecord');
for sample in samplelist[0:trainset_size]:
cap = cv2.VideoCapture(sample[0]);
        if not cap.isOpened():
            print(sample[0] + " can't be opened!");
            continue;
        features = np.zeros((16,112,112,3),dtype = np.uint8); #rolling buffer holding one 16-frame clip
count = 0;
while True:
            #clip length = 16 frames, temporal stride = 8: after writing a clip, keep its last 8 frames and read 8 new ones
if count == 16:
trainsample = tf.train.Example(features = tf.train.Features(
feature = {
'clips': tf.train.Feature(bytes_list = tf.train.BytesList(value = [features.tobytes()])),
'label': tf.train.Feature(int64_list = tf.train.Int64List(value = [sample[1]]))
}
));
writer.write(trainsample.SerializeToString());
#copy the last 8 frames to first 8 frame position of features[]
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read();
            if not ret:
                break;
            frame = cv2.resize(frame,(160,120))[4:116,24:136]; #resize to 160x120, then center-crop the central 112x112 region
features[count,...] = frame;
count = count + 1;
writer.close();
#write all test samples to tfrecord
    if os.path.exists('testset.tfrecord'):
        os.remove('testset.tfrecord');
writer = tf.python_io.TFRecordWriter('testset.tfrecord');
for sample in samplelist[trainset_size:]:
cap = cv2.VideoCapture(sample[0]);
        if not cap.isOpened():
            print(sample[0] + " can't be opened!");
            continue;
        features = np.zeros((16,112,112,3),dtype = np.uint8); #rolling buffer holding one 16-frame clip
count = 0;
while True:
            #clip length = 16 frames, temporal stride = 8 (same rolling scheme as the training loop)
            if count == 16:
                testsample = tf.train.Example(features = tf.train.Features(
                    feature = {
                        'clips': tf.train.Feature(bytes_list = tf.train.BytesList(value = [features.tobytes()])),
                        'label': tf.train.Feature(int64_list = tf.train.Int64List(value = [sample[1]]))
                    }
                ));
                writer.write(testsample.SerializeToString());
#copy the last 8 frames to first 8 frame position of features[]
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read(); #read one video frame
            if not ret:
                break;
            frame = cv2.resize(frame,(160,120))[4:116,24:136]; #resize to 160x120, then center-crop the central 112x112 region
            features[count,...] = frame; #stack the cropped frame into the (16,112,112,3) clip buffer
            count = count + 1;
writer.close();
if __name__ == "__main__":
video2sample('UCF-101');
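To sanity-check the records written above, a minimal sketch like the following (my own addition, assuming trainset.tfrecord produced by this script sits in the working directory and the TensorFlow 1.x API) reads one serialized example back and decodes the clip:
import numpy as np;
import tensorflow as tf;
#sanity-check sketch: read one record and decode the (16,112,112,3) uint8 clip buffer
record_iterator = tf.python_io.tf_record_iterator('trainset.tfrecord');
example = tf.train.Example();
example.ParseFromString(next(record_iterator));
clip_bytes = example.features.feature['clips'].bytes_list.value[0];
label = example.features.feature['label'].int64_list.value[0];
clip = np.frombuffer(clip_bytes, dtype = np.uint8).reshape(16,112,112,3);
print('label =', label, 'clip shape =', clip.shape);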
- train_ucf101.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import;
from __future__ import division;
from __future__ import print_function;
import os;
import numpy as np;
import tensorflow as tf;
batch_size = 8;
class_num = 101;
def main(unused_argv):
action_classifier = tf.estimator.Estimator(model_fn = action_model_fn, model_dir = "action_classifier_model");
tf.logging.set_verbosity(tf.logging.DEBUG);
logging_hook = tf.train.LoggingTensorHook(tensors = {"loss":"loss"}, every_n_iter = 1);
action_classifier.train(input_fn = train_input_fn,steps = 200000,hooks = [logging_hook]);
eval_results = action_classifier.evaluate(input_fn = eval_input_fn);
print(eval_results);
def parse_function(serialized_example):
feature = tf.parse_single_example(
serialized_example,
features = {
'clips': tf.FixedLenFeature((),dtype = tf.string, default_value = ''),
'label': tf.FixedLenFeature((),dtype = tf.int64, default_value = 0)
}
);
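    #decode_raw + reshape must match the uint8 (16,112,112,3) layout written by features.tobytes() in create_tfrecord_dataset.py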
clips = tf.decode_raw(feature['clips'],out_type = tf.uint8);
clips = tf.reshape(clips,[16,112,112,3]);
clips = tf.cast(clips, dtype = tf.float32);
label = tf.cast(feature['label'], dtype = tf.int32);
return clips,label;
def train_input_fn():
dataset = tf.data.TFRecordDataset(['trainset.tfrecord']);
dataset = dataset.map(parse_function);
dataset = dataset.shuffle(buffer_size = 512);
dataset = dataset.batch(batch_size);
dataset = dataset.repeat(200);
iterator = dataset.make_one_shot_iterator();
features, labels = iterator.get_next();
return features, labels;
def eval_input_fn():
dataset = tf.data.TFRecordDataset(['testset.tfrecord']);
dataset = dataset.map(parse_function);
dataset = dataset.shuffle(buffer_size = 512);
dataset = dataset.batch(batch_size);
dataset = dataset.repeat(1);
iterator = dataset.make_one_shot_iterator();
features, labels = iterator.get_next();
return features, labels;
def action_model_fn(features, labels, mode):
# with tf.device('/device:GPU:1'):
#layer 1
c1 = tf.layers.conv3d(features,filters = 64, kernel_size = [3,3,3], padding = "same");
b1 = tf.contrib.layers.layer_norm(c1,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
    p1 = tf.layers.max_pooling3d(b1,pool_size = [1,2,2], strides = [1,2,2], padding = "same"); #first pooling layer keeps the temporal dimension (C3D convention)
#layer 2
c2 = tf.layers.conv3d(p1,filters = 128, kernel_size = [3,3,3], padding = "same");
b2 = tf.contrib.layers.layer_norm(c2,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p2 = tf.layers.max_pooling3d(b2,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#layer 3
c3a = tf.layers.conv3d(p2,filters = 256, kernel_size = [3,3,3], padding = "same");
b3a = tf.contrib.layers.layer_norm(c3a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c3b = tf.layers.conv3d(b3a,filters = 256, kernel_size = [3,3,3], padding = "same");
b3b = tf.contrib.layers.layer_norm(c3b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p3 = tf.layers.max_pooling3d(b3b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#layer 4
c4a = tf.layers.conv3d(p3,filters = 512, kernel_size = [3,3,3], padding = "same");
b4a = tf.contrib.layers.layer_norm(c4a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c4b = tf.layers.conv3d(b4a,filters = 512, kernel_size = [3,3,3], padding = "same");
b4b = tf.contrib.layers.layer_norm(c4b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p4 = tf.layers.max_pooling3d(b4b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
# with tf.device('/device:GPU:2'):
#layer 5
c5a = tf.layers.conv3d(p4,filters = 512, kernel_size = [3,3,3], padding = "same");
b5a = tf.contrib.layers.layer_norm(c5a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c5b = tf.layers.conv3d(b5a,filters = 512, kernel_size = [3,3,3], padding = "same");
b5b = tf.contrib.layers.layer_norm(c5b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p5 = tf.layers.max_pooling3d(b5b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#flatten
f = tf.layers.flatten(p5);
d1 = tf.layers.dense(f,units = 4096, activation = tf.nn.relu);
dp1 = tf.layers.dropout(d1,training = mode == tf.estimator.ModeKeys.TRAIN);
d2 = tf.layers.dense(dp1,units = 4096, activation = tf.nn.relu);
dp2 = tf.layers.dropout(d2,training = mode == tf.estimator.ModeKeys.TRAIN);
logits = tf.layers.dense(dp2,units = class_num);
#predict mode
if mode == tf.estimator.ModeKeys.PREDICT:
action = tf.argmax(logits,axis = 1);
return tf.estimator.EstimatorSpec(mode = mode,predictions = action);
if mode == tf.estimator.ModeKeys.TRAIN:
onehot_labels = tf.one_hot(labels,class_num);
loss = tf.losses.softmax_cross_entropy(onehot_labels,logits);
loss = tf.identity(loss,name = "loss");
optimizer = tf.train.AdamOptimizer(1e-4);
train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step());
return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op);
if mode == tf.estimator.ModeKeys.EVAL:
onehot_labels = tf.one_hot(labels,class_num);
loss = tf.losses.softmax_cross_entropy(onehot_labels,logits);
loss = tf.identity(loss,name = "loss");
eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels = labels,predictions = tf.argmax(logits,axis = 1))};
return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops = eval_metric_ops);
raise Exception('Unknown mode of estimator!');
if __name__ == "__main__":
tf.app.run();
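As a reference for the dense-layer sizes, here is a back-of-the-envelope sketch (my own addition, not part of the training script) tracing a (16,112,112) clip through the pooling schedule above; with "same" padding each pooled dimension rounds up:
import math
shape = [16, 112, 112]; #(frames, height, width)
pools = [[1,2,2], [2,2,2], [2,2,2], [2,2,2], [2,2,2]];
for i, p in enumerate(pools, start = 1):
    shape = [math.ceil(s / k) for s, k in zip(shape, p)]; #"same" padding rounds up
    print('p%d:' % i, shape);
#p5 ends at [1,4,4]; with 512 channels the flattened input to the first dense layer has 512*1*4*4 = 8192 units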
- tf.train.Feature usage
import struct
import tensorflow as tf
def read_text_file(text_file):
lines = []
with open(text_file, "r") as f:
for line in f:
lines.append(line.strip())
return lines
def text_to_binary(in_file, out_file):
inputs = read_text_file(in_file)
with open(out_file, 'wb') as writer:
data_id = tf.train.Int64List(value=[int(inputs[0])])
data = tf.train.BytesList(value=[bytes(' '.join(inputs[1:]), encoding='utf-8')])
feature_dict = {
"data_id": tf.train.Feature(int64_list=data_id),
"data": tf.train.Feature(bytes_list=data)
}
features = tf.train.Features(feature=feature_dict)
example = tf.train.Example(features=features)
example_str = example.SerializeToString()
        str_len = len(example_str)
        writer.write(struct.pack('H', str_len))  # 2-byte length prefix; assumes the serialized example fits in 65535 bytes
        writer.write(struct.pack('%ds' % str_len, example_str))
if __name__ == '__main__':
    text_to_binary('data.txt', 'data.bin')
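A matching reader for this length-prefixed format could look like the sketch below (binary_to_text is a hypothetical counterpart of my own, assuming data.bin was produced by text_to_binary above):
import struct
import tensorflow as tf

def binary_to_text(in_file):
    # read back the length-prefixed tf.train.Example records written by text_to_binary
    with open(in_file, 'rb') as reader:
        prefix = reader.read(2)  # the 2-byte 'H' length header
        while prefix:
            str_len, = struct.unpack('H', prefix)
            example = tf.train.Example()
            example.ParseFromString(reader.read(str_len))
            data_id = example.features.feature['data_id'].int64_list.value[0]
            data = example.features.feature['data'].bytes_list.value[0].decode('utf-8')
            print(data_id, data)
            prefix = reader.read(2)

if __name__ == '__main__':
    binary_to_text('data.bin')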
- ActionRecognize.py
#!/usr/bin/python3
import numpy as np
import tensorflow as tf;
import cv2;
import pickle;
from train_ucf101 import action_model_fn;
class ActionRecognition(object):
def __init__(self):
self.classifier = tf.estimator.Estimator(model_fn = action_model_fn, model_dir = 'action_classifier_model');
        with open('id2classname.dat','rb') as f:
            self.labels = pickle.loads(f.read());
def predict(self,fname = None):
#play video and print the class label
assert type(fname) is str;
cap = cv2.VideoCapture(fname);
        if not cap.isOpened(): raise ValueError('invalid video');
cv2.namedWindow('show');
features = np.zeros((16,112,112,3),dtype = np.uint8);
count = 0;
status = -1;
while True:
if count == 16:
#update status
                batch = np.reshape(features,(1,16,112,112,3)).astype(np.float32);
                input_fn = lambda: tf.convert_to_tensor(batch);
                prediction = self.classifier.predict(input_fn);
                status = next(prediction); #note: Estimator.predict rebuilds the graph on every call, so this is slow
                #drop the earliest 8 frames: shift the last 8 frames to the front of the buffer
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read();
            if not ret: break;
#stack cropped frame to features
cropped = cv2.resize(frame,(160,120))[4:116,24:136];
features[count,...] = cropped;
#show labeled frame
if status != -1:
label = self.labels[status];
cv2.putText(frame, label, (50,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0));
cv2.imshow('show',frame);
            k = cv2.waitKey(25);
            if (k & 0xFF) == ord('q'): break; #waitKey returns an int, so compare against ord('q')
#update counter
count += 1;
if __name__ == "__main__":
recognizer = ActionRecognition();
recognizer.predict('UCF-101/CricketShot/v_CricketShot_g12_c07.avi');
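Since Estimator.predict rebuilds the graph on every call, per-clip prediction inside the playback loop is slow. One workaround is to batch several clips into a single call; predict_clips below is a hypothetical helper of my own, assuming the classifier above and a list of (16,112,112,3) uint8 clip arrays:
import numpy as np;
import tensorflow as tf;
def predict_clips(classifier, clips):
    #one Estimator.predict call over a whole batch of clips, so the graph is built once
    batch = np.stack(clips).astype(np.float32); #shape (N,16,112,112,3)
    input_fn = lambda: tf.convert_to_tensor(batch);
    return list(classifier.predict(input_fn)); #one class id per clip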