UCF-101 3D-convolution input example

The scripts below form a C3D-style action-recognition pipeline on UCF-101 with TensorFlow 1.x: UCF-101 videos are converted into fixed-length 16-frame clips stored as TFRecords, a 3D-convolutional classifier is trained with tf.estimator, and inference is run frame-by-frame on a video.
- create_tfrecord_dataset.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import cv2;
import numpy as np;
from random import shuffle;
import os;
import tensorflow as tf;
import pickle;
def video2sample(ucf_rootdir):
    if not os.path.isdir(ucf_rootdir):
        print("invalid UCF root directory!");
        exit(1);
    dirs = os.listdir(ucf_rootdir);
classname = dict();
samplelist = list();
#collect all samples
for i,dirname in enumerate(dirs):
        assert os.path.isdir(os.path.join(ucf_rootdir,dirname));
        classname[i] = dirname;
        videos = os.listdir(os.path.join(ucf_rootdir,dirname));
#for every video
for j in range(0,len(videos)):
vidname = videos[j];
name,ext = os.path.splitext(vidname);
if ext == '.avi' or ext == '.AVI':
vp = os.path.join(ucf_rootdir,dirname,vidname);
samplelist.append((vp,i));
#output id->classname map to file
with open('id2classname.dat','wb') as f:
f.write(pickle.dumps(classname));
    #shuffle samples and split 90% train / 10% test
    shuffle(samplelist);
    trainset_size = 9 * len(samplelist) // 10;
#write all train samples to tfrecord
    if os.path.exists('trainset.tfrecord'):
        os.remove('trainset.tfrecord');
writer = tf.python_io.TFRecordWriter('trainset.tfrecord');
for sample in samplelist[0:trainset_size]:
cap = cv2.VideoCapture(sample[0]);
        if not cap.isOpened():
            print(sample[0] + " can't be opened!");
            continue;
        features = np.zeros((16,112,112,3),dtype = np.uint8); #rolling buffer holding one 16-frame clip
count = 0;
while True:
            #clip length = 16 frames, temporal stride = 8: after writing a clip, keep its last 8 frames and read 8 new ones
if count == 16:
trainsample = tf.train.Example(features = tf.train.Features(
feature = {
'clips': tf.train.Feature(bytes_list = tf.train.BytesList(value = [features.tobytes()])),
'label': tf.train.Feature(int64_list = tf.train.Int64List(value = [sample[1]]))
}
));
writer.write(trainsample.SerializeToString());
#copy the last 8 frames to first 8 frame position of features[]
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read();
            if not ret:
                break;
            frame = cv2.resize(frame,(160,120))[4:116,24:136]; #resize to 160x120, then center-crop the central 112x112 region
features[count,...] = frame;
count = count + 1;
writer.close();
#write all test samples to tfrecord
    if os.path.exists('testset.tfrecord'):
        os.remove('testset.tfrecord');
writer = tf.python_io.TFRecordWriter('testset.tfrecord');
for sample in samplelist[trainset_size:]:
cap = cv2.VideoCapture(sample[0]);
        if not cap.isOpened():
            print(sample[0] + " can't be opened!");
            continue;
        features = np.zeros((16,112,112,3),dtype = np.uint8); #rolling buffer holding one 16-frame clip
count = 0;
while True:
            #clip length = 16 frames, temporal stride = 8 (same rolling scheme as the training loop)
            if count == 16:
                testsample = tf.train.Example(features = tf.train.Features(
                    feature = {
                        'clips': tf.train.Feature(bytes_list = tf.train.BytesList(value = [features.tobytes()])),
                        'label': tf.train.Feature(int64_list = tf.train.Int64List(value = [sample[1]]))
                    }
                ));
                writer.write(testsample.SerializeToString());
#copy the last 8 frames to first 8 frame position of features[]
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read(); #read one video frame
            if not ret:
                break;
            frame = cv2.resize(frame,(160,120))[4:116,24:136]; #resize to 160x120, then center-crop the central 112x112 region
            features[count,...] = frame; #stack the cropped frame into the (16,112,112,3) clip buffer
            count = count + 1;
writer.close();
if __name__ == "__main__":
video2sample('UCF-101');
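To sanity-check the records written above, a minimal sketch like the following (my own addition, assuming trainset.tfrecord produced by this script sits in the working directory and the TensorFlow 1.x API) reads one serialized example back and decodes the clip:
import numpy as np;
import tensorflow as tf;
#sanity-check sketch: read one record and decode the (16,112,112,3) uint8 clip buffer
record_iterator = tf.python_io.tf_record_iterator('trainset.tfrecord');
example = tf.train.Example();
example.ParseFromString(next(record_iterator));
clip_bytes = example.features.feature['clips'].bytes_list.value[0];
label = example.features.feature['label'].int64_list.value[0];
clip = np.frombuffer(clip_bytes, dtype = np.uint8).reshape(16,112,112,3);
print('label =', label, 'clip shape =', clip.shape);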
- train_ucf101.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import;
from __future__ import division;
from __future__ import print_function;
import os;
import numpy as np;
import tensorflow as tf;
batch_size = 8;
class_num = 101;
def main(unused_argv):
action_classifier = tf.estimator.Estimator(model_fn = action_model_fn, model_dir = "action_classifier_model");
tf.logging.set_verbosity(tf.logging.DEBUG);
logging_hook = tf.train.LoggingTensorHook(tensors = {"loss":"loss"}, every_n_iter = 1);
action_classifier.train(input_fn = train_input_fn,steps = 200000,hooks = [logging_hook]);
eval_results = action_classifier.evaluate(input_fn = eval_input_fn);
print(eval_results);
def parse_function(serialized_example):
feature = tf.parse_single_example(
serialized_example,
features = {
'clips': tf.FixedLenFeature((),dtype = tf.string, default_value = ''),
'label': tf.FixedLenFeature((),dtype = tf.int64, default_value = 0)
}
);
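    #decode_raw + reshape must match the uint8 (16,112,112,3) layout written by features.tobytes() in create_tfrecord_dataset.py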
clips = tf.decode_raw(feature['clips'],out_type = tf.uint8);
clips = tf.reshape(clips,[16,112,112,3]);
clips = tf.cast(clips, dtype = tf.float32);
label = tf.cast(feature['label'], dtype = tf.int32);
return clips,label;
def train_input_fn():
dataset = tf.data.TFRecordDataset(['trainset.tfrecord']);
dataset = dataset.map(parse_function);
dataset = dataset.shuffle(buffer_size = 512);
dataset = dataset.batch(batch_size);
dataset = dataset.repeat(200);
iterator = dataset.make_one_shot_iterator();
features, labels = iterator.get_next();
return features, labels;
def eval_input_fn():
dataset = tf.data.TFRecordDataset(['testset.tfrecord']);
dataset = dataset.map(parse_function);
dataset = dataset.shuffle(buffer_size = 512);
dataset = dataset.batch(batch_size);
dataset = dataset.repeat(1);
iterator = dataset.make_one_shot_iterator();
features, labels = iterator.get_next();
return features, labels;
def action_model_fn(features, labels, mode):
# with tf.device('/device:GPU:1'):
#layer 1
c1 = tf.layers.conv3d(features,filters = 64, kernel_size = [3,3,3], padding = "same");
b1 = tf.contrib.layers.layer_norm(c1,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
    p1 = tf.layers.max_pooling3d(b1,pool_size = [1,2,2], strides = [1,2,2], padding = "same"); #first pooling layer keeps the temporal dimension (C3D convention)
#layer 2
c2 = tf.layers.conv3d(p1,filters = 128, kernel_size = [3,3,3], padding = "same");
b2 = tf.contrib.layers.layer_norm(c2,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p2 = tf.layers.max_pooling3d(b2,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#layer 3
c3a = tf.layers.conv3d(p2,filters = 256, kernel_size = [3,3,3], padding = "same");
b3a = tf.contrib.layers.layer_norm(c3a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c3b = tf.layers.conv3d(b3a,filters = 256, kernel_size = [3,3,3], padding = "same");
b3b = tf.contrib.layers.layer_norm(c3b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p3 = tf.layers.max_pooling3d(b3b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#layer 4
c4a = tf.layers.conv3d(p3,filters = 512, kernel_size = [3,3,3], padding = "same");
b4a = tf.contrib.layers.layer_norm(c4a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c4b = tf.layers.conv3d(b4a,filters = 512, kernel_size = [3,3,3], padding = "same");
b4b = tf.contrib.layers.layer_norm(c4b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p4 = tf.layers.max_pooling3d(b4b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
# with tf.device('/device:GPU:2'):
#layer 5
c5a = tf.layers.conv3d(p4,filters = 512, kernel_size = [3,3,3], padding = "same");
b5a = tf.contrib.layers.layer_norm(c5a,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
c5b = tf.layers.conv3d(b5a,filters = 512, kernel_size = [3,3,3], padding = "same");
b5b = tf.contrib.layers.layer_norm(c5b,activation_fn = tf.nn.relu, trainable = mode == tf.estimator.ModeKeys.TRAIN);
p5 = tf.layers.max_pooling3d(b5b,pool_size = [2,2,2], strides = [2,2,2], padding = "same");
#flatten
f = tf.layers.flatten(p5);
d1 = tf.layers.dense(f,units = 4096, activation = tf.nn.relu);
dp1 = tf.layers.dropout(d1,training = mode == tf.estimator.ModeKeys.TRAIN);
d2 = tf.layers.dense(dp1,units = 4096, activation = tf.nn.relu);
dp2 = tf.layers.dropout(d2,training = mode == tf.estimator.ModeKeys.TRAIN);
logits = tf.layers.dense(dp2,units = class_num);
#predict mode
if mode == tf.estimator.ModeKeys.PREDICT:
action = tf.argmax(logits,axis = 1);
return tf.estimator.EstimatorSpec(mode = mode,predictions = action);
if mode == tf.estimator.ModeKeys.TRAIN:
onehot_labels = tf.one_hot(labels,class_num);
loss = tf.losses.softmax_cross_entropy(onehot_labels,logits);
loss = tf.identity(loss,name = "loss");
optimizer = tf.train.AdamOptimizer(1e-4);
train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step());
return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op);
if mode == tf.estimator.ModeKeys.EVAL:
onehot_labels = tf.one_hot(labels,class_num);
loss = tf.losses.softmax_cross_entropy(onehot_labels,logits);
loss = tf.identity(loss,name = "loss");
eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels = labels,predictions = tf.argmax(logits,axis = 1))};
return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops = eval_metric_ops);
raise Exception('Unknown mode of estimator!');
if __name__ == "__main__":
tf.app.run();
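As a reference for the dense-layer sizes, here is a back-of-the-envelope sketch (my own addition, not part of the training script) tracing a (16,112,112) clip through the pooling schedule above; with "same" padding each pooled dimension rounds up:
import math
shape = [16, 112, 112]; #(frames, height, width)
pools = [[1,2,2], [2,2,2], [2,2,2], [2,2,2], [2,2,2]];
for i, p in enumerate(pools, start = 1):
    shape = [math.ceil(s / k) for s, k in zip(shape, p)]; #"same" padding rounds up
    print('p%d:' % i, shape);
#p5 ends at [1,4,4]; with 512 channels the flattened input to the first dense layer has 512*1*4*4 = 8192 units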
- tf.train.Feature usage
import struct
import tensorflow as tf
def read_text_file(text_file):
lines = []
with open(text_file, "r") as f:
for line in f:
lines.append(line.strip())
return lines
def text_to_binary(in_file, out_file):
inputs = read_text_file(in_file)
with open(out_file, 'wb') as writer:
data_id = tf.train.Int64List(value=[int(inputs[0])])
data = tf.train.BytesList(value=[bytes(' '.join(inputs[1:]), encoding='utf-8')])
feature_dict = {
"data_id": tf.train.Feature(int64_list=data_id),
"data": tf.train.Feature(bytes_list=data)
}
features = tf.train.Features(feature=feature_dict)
example = tf.train.Example(features=features)
example_str = example.SerializeToString()
        str_len = len(example_str)
        writer.write(struct.pack('H', str_len))  # 2-byte length prefix; assumes the serialized example fits in 65535 bytes
        writer.write(struct.pack('%ds' % str_len, example_str))
if __name__ == '__main__':
    text_to_binary('data.txt', 'data.bin')
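A matching reader for this length-prefixed format could look like the sketch below (binary_to_text is a hypothetical counterpart of my own, assuming data.bin was produced by text_to_binary above):
import struct
import tensorflow as tf

def binary_to_text(in_file):
    # read back the length-prefixed tf.train.Example records written by text_to_binary
    with open(in_file, 'rb') as reader:
        prefix = reader.read(2)  # the 2-byte 'H' length header
        while prefix:
            str_len, = struct.unpack('H', prefix)
            example = tf.train.Example()
            example.ParseFromString(reader.read(str_len))
            data_id = example.features.feature['data_id'].int64_list.value[0]
            data = example.features.feature['data'].bytes_list.value[0].decode('utf-8')
            print(data_id, data)
            prefix = reader.read(2)

if __name__ == '__main__':
    binary_to_text('data.bin')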
- ActionRecognize.py
#!/usr/bin/python3
import numpy as np
import tensorflow as tf;
import cv2;
import pickle;
from train_ucf101 import action_model_fn;
class ActionRecognition(object):
def __init__(self):
self.classifier = tf.estimator.Estimator(model_fn = action_model_fn, model_dir = 'action_classifier_model');
        with open('id2classname.dat','rb') as f:
            self.labels = pickle.loads(f.read());
def predict(self,fname = None):
#play video and print the class label
assert type(fname) is str;
cap = cv2.VideoCapture(fname);
        if not cap.isOpened(): raise ValueError('invalid video');
cv2.namedWindow('show');
features = np.zeros((16,112,112,3),dtype = np.uint8);
count = 0;
status = -1;
while True:
if count == 16:
#update status
                batch = np.reshape(features,(1,16,112,112,3)).astype(np.float32);
                input_fn = lambda: tf.convert_to_tensor(batch);
                prediction = self.classifier.predict(input_fn);
                status = next(prediction); #note: Estimator.predict rebuilds the graph on every call, so this is slow
                #drop the earliest 8 frames: shift the last 8 frames to the front of the buffer
features[0:8,...] = features[8:16,...];
count = 8;
            ret,frame = cap.read();
            if not ret: break;
#stack cropped frame to features
cropped = cv2.resize(frame,(160,120))[4:116,24:136];
features[count,...] = cropped;
#show labeled frame
if status != -1:
label = self.labels[status];
cv2.putText(frame, label, (50,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0));
cv2.imshow('show',frame);
            k = cv2.waitKey(25);
            if (k & 0xFF) == ord('q'): break; #waitKey returns an int, so compare against ord('q')
#update counter
count += 1;
if __name__ == "__main__":
recognizer = ActionRecognition();
recognizer.predict('UCF-101/CricketShot/v_CricketShot_g12_c07.avi');
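Since Estimator.predict rebuilds the graph on every call, per-clip prediction inside the playback loop is slow. One workaround is to batch several clips into a single call; predict_clips below is a hypothetical helper of my own, assuming the classifier above and a list of (16,112,112,3) uint8 clip arrays:
import numpy as np;
import tensorflow as tf;
def predict_clips(classifier, clips):
    #one Estimator.predict call over a whole batch of clips, so the graph is built once
    batch = np.stack(clips).astype(np.float32); #shape (N,16,112,112,3)
    input_fn = lambda: tf.convert_to_tensor(batch);
    return list(classifier.predict(input_fn)); #one class id per clip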