diff --git a/fluid/PaddleCV/video/.gitignore b/fluid/PaddleCV/video/.gitignore
index c06b6205e64969ad19649c8c55e18d82d147da09..7052bdda1c76c2ab1adebd204bdef9ebf1a39755 100644
--- a/fluid/PaddleCV/video/.gitignore
+++ b/fluid/PaddleCV/video/.gitignore
@@ -1,4 +1,3 @@
-dataset
 checkpoints
 output*
 *.pyc
diff --git a/fluid/PaddleCV/video/dataset/kinetics/README.md b/fluid/PaddleCV/video/dataset/kinetics/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..25eaee375dd126cd58f1188ed348619a7675f513
--- /dev/null
+++ b/fluid/PaddleCV/video/dataset/kinetics/README.md
@@ -0,0 +1,5 @@
+1. Download kinetics-400_train.csv and kinetics-400_val.csv.
+2. ffmpeg is required to decode the mp4 videos.
+3. Convert each mp4 video to a pkl file; each pkl stores [video_id, label, images]:
+    python generate_label.py kinetics-400_train.csv kinetics400_label.txt  # generate the label file
+    python video2pkl.py kinetics-400_train.csv $Source_dir $Target_dir $NUM_THREADS
diff --git a/fluid/PaddleCV/video/dataset/kinetics/generate_label.py b/fluid/PaddleCV/video/dataset/kinetics/generate_label.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f7c504c56821527cde57bacf7e9a2d07c666c8f
--- /dev/null
+++ b/fluid/PaddleCV/video/dataset/kinetics/generate_label.py
@@ -0,0 +1,31 @@
+import sys
+
+# kinetics-400_train.csv should be downloaded first and set as sys.argv[1]
+# sys.argv[2] can be set as kinetics400_label.txt
+# python generate_label.py kinetics-400_train.csv kinetics400_label.txt
+
+num_classes = 400
+
+fname = sys.argv[1]
+outname = sys.argv[2]
+fl = open(fname).readlines()
+fl = fl[1:]  # skip the csv header
+outf = open(outname, 'w')
+
+label_list = []
+for line in fl:
+    label = line.strip().split(',')[0].strip('"')
+    if label in label_list:
+        continue
+    else:
+        label_list.append(label)
+
+assert len(label_list) == num_classes, \
+    "there should be {} labels in list, but got {}".format(
+        num_classes, len(label_list))
+
+label_list.sort()
+for i in range(num_classes):
+    outf.write('{} {}\n'.format(label_list[i], i))
+
+outf.close()
diff --git a/fluid/PaddleCV/video/dataset/kinetics/video2pkl.py b/fluid/PaddleCV/video/dataset/kinetics/video2pkl.py
new file mode 100644
index 0000000000000000000000000000000000000000..881857c40c4ece2f192e681526e2622ef1ce2f81
--- /dev/null
+++ b/fluid/PaddleCV/video/dataset/kinetics/video2pkl.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import glob
+import cPickle
+from multiprocessing import Pool
+
+# example command line: python video2pkl.py kinetics-400_train.csv $Source_dir $Target_dir 8
+#
+# kinetics-400_train.csv is the training set file of the K400 official release;
+# each line contains label,youtube_id,time_start,time_end,split,is_cc
+
+assert (len(sys.argv) == 5)
+
+f = open(sys.argv[1])
+source_dir = sys.argv[2]
+target_dir = sys.argv[3]
+num_threads = sys.argv[4]
+all_video_entries = [x.strip().split(',') for x in f.readlines()]
+all_video_entries = all_video_entries[1:]  # skip the csv header
+f.close()
+
+category_label_map = {}
+f = open('kinetics400_label.txt')
+for line in f:
+    ens = line.strip().split(' ')
+    category = " ".join(ens[0:-1])
+    label = int(ens[-1])
+    category_label_map[category] = label
+f.close()
+
+
+def generate_pkl(entry):
+    mode = entry[4]  # train/val split, currently unused
+    category = entry[0].strip('"')
+    category_dir = category
+    video_path = os.path.join(
+        './',
+        entry[1] + "_%06d" % int(entry[2]) + "_%06d" % int(entry[3]) + ".mp4")
+    video_path = os.path.join(source_dir, category_dir, video_path)
+    label = category_label_map[category]
+
+    vid = './' + video_path.split('/')[-1].split('.')[0]
+    if os.path.exists(video_path):
+        if not os.path.exists(vid):
+            os.makedirs(vid)
+        os.system('ffmpeg -i ' + video_path + ' -q 0 ' + vid + '/%06d.jpg')
+    else:
+        print("File does not exist: {}".format(video_path))
+        return
+
+    images = sorted(glob.glob(vid + '/*.jpg'))
+    ims = []
+    for img in images:
+        f = open(img, 'rb')  # jpeg data must be read in binary mode
+        ims.append(f.read())
+        f.close()
+
+    output_pkl = vid + ".pkl"
+    output_pkl = os.path.join(target_dir, output_pkl)
+    f = open(output_pkl, 'wb')  # pickles must be written in binary mode
+    cPickle.dump((vid, label, ims), f, -1)
+    f.close()
+
+    os.system('rm -rf %s' % vid)
+
+
+pool = Pool(processes=int(num_threads))
+pool.map(generate_pkl, all_video_entries)
+pool.close()
+pool.join()
diff --git a/fluid/PaddleCV/video/dataset/youtube8m/README.md b/fluid/PaddleCV/video/dataset/youtube8m/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9f2d2c9a617c55a5ab4f48752057d0baf03b723
--- /dev/null
+++ b/fluid/PaddleCV/video/dataset/youtube8m/README.md
@@ -0,0 +1,2 @@
+1. TensorFlow is required to process the tfrecords.
+2. python tf2pkl.py $Source_dir $Target_dir
diff --git a/fluid/PaddleCV/video/dataset/youtube8m/tf2pkl.py b/fluid/PaddleCV/video/dataset/youtube8m/tf2pkl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b32e3b41a705d6e294581ca3b92c911d238798f
--- /dev/null
+++ b/fluid/PaddleCV/video/dataset/youtube8m/tf2pkl.py
@@ -0,0 +1,278 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Provides readers configured for different datasets.""" +import os, sys +import numpy as np +import tensorflow as tf +from tensorflow import logging +import cPickle + +from tensorflow.python.platform import gfile + +assert (len(sys.argv) == 3) +source_dir = sys.argv[1] +target_dir = sys.argv[2] + + +def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2): + """Dequantize the feature from the byte format to the float format. + + Args: + feat_vector: the input 1-d vector. + max_quantized_value: the maximum of the quantized value. + min_quantized_value: the minimum of the quantized value. + + Returns: + A float vector which has the same shape as feat_vector. + """ + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + return feat_vector * scalar + bias + + +def resize_axis(tensor, axis, new_size, fill_value=0): + """Truncates or pads a tensor to new_size on on a given axis. + + Truncate or extend tensor such that tensor.shape[axis] == new_size. If the + size increases, the padding will be performed at the end, using fill_value. + + Args: + tensor: The tensor to be resized. + axis: An integer representing the dimension to be sliced. + new_size: An integer or 0d tensor representing the new value for + tensor.shape[axis]. + fill_value: Value to use to fill any new entries in the tensor. Will be + cast to the type of tensor. + + Returns: + The resized tensor. + """ + tensor = tf.convert_to_tensor(tensor) + shape = tf.unstack(tf.shape(tensor)) + + pad_shape = shape[:] + pad_shape[axis] = tf.maximum(0, new_size - shape[axis]) + + shape[axis] = tf.minimum(shape[axis], new_size) + shape = tf.stack(shape) + + resized = tf.concat([ + tf.slice(tensor, tf.zeros_like(shape), shape), + tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype)) + ], axis) + + # Update shape. + new_shape = tensor.get_shape().as_list() # A copy is being made. + new_shape[axis] = new_size + resized.set_shape(new_shape) + return resized + + +class BaseReader(object): + """Inherit from this class when implementing new readers.""" + + def prepare_reader(self, unused_filename_queue): + """Create a thread for generating prediction and label tensors.""" + raise NotImplementedError() + + +class YT8MFrameFeatureReader(BaseReader): + """Reads TFRecords of SequenceExamples. + + The TFRecords must contain SequenceExamples with the sparse in64 'labels' + context feature and a fixed length byte-quantized feature vector, obtained + from the features in 'feature_names'. The quantized features will be mapped + back into a range between min_quantized_value and max_quantized_value. + """ + + def __init__(self, + num_classes=3862, + feature_sizes=[1024], + feature_names=["inc3"], + max_frames=300): + """Construct a YT8MFrameFeatureReader. + + Args: + num_classes: a positive integer for the number of classes. + feature_sizes: positive integer(s) for the feature dimensions as a list. + feature_names: the feature name(s) in the tensorflow record as a list. + max_frames: the maximum number of frames to process. 
+ """ + + assert len(feature_names) == len(feature_sizes), \ + "length of feature_names (={}) != length of feature_sizes (={})".format( \ + len(feature_names), len(feature_sizes)) + + self.num_classes = num_classes + self.feature_sizes = feature_sizes + self.feature_names = feature_names + self.max_frames = max_frames + + def get_video_matrix(self, features, feature_size, max_frames, + max_quantized_value, min_quantized_value): + """Decodes features from an input string and quantizes it. + + Args: + features: raw feature values + feature_size: length of each frame feature vector + max_frames: number of frames (rows) in the output feature_matrix + max_quantized_value: the maximum of the quantized value. + min_quantized_value: the minimum of the quantized value. + + Returns: + feature_matrix: matrix of all frame-features + num_frames: number of frames in the sequence + """ + decoded_features = tf.reshape( + tf.cast(tf.decode_raw(features, tf.uint8), tf.float32), + [-1, feature_size]) + + num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames) + + feature_matrix = decoded_features + + return feature_matrix, num_frames + + def prepare_reader(self, + filename_queue, + max_quantized_value=2, + min_quantized_value=-2): + """Creates a single reader thread for YouTube8M SequenceExamples. + + Args: + filename_queue: A tensorflow queue of filename locations. + max_quantized_value: the maximum of the quantized value. + min_quantized_value: the minimum of the quantized value. + + Returns: + A tuple of video indexes, video features, labels, and padding data. + """ + reader = tf.TFRecordReader() + _, serialized_example = reader.read(filename_queue) + + contexts, features = tf.parse_single_sequence_example( + serialized_example, + context_features={ + "id": tf.FixedLenFeature([], tf.string), + "labels": tf.VarLenFeature(tf.int64) + }, + sequence_features={ + feature_name: tf.FixedLenSequenceFeature( + [], dtype=tf.string) + for feature_name in self.feature_names + }) + + # read ground truth labels + labels = (tf.cast( + tf.sparse_to_dense( + contexts["labels"].values, (self.num_classes, ), + 1, + validate_indices=False), + tf.bool)) + + # loads (potentially) different types of features and concatenates them + num_features = len(self.feature_names) + assert num_features > 0, "No feature selected: feature_names is empty!" 
+
+        assert len(self.feature_names) == len(self.feature_sizes), \
+            "length of feature_names (={}) != length of feature_sizes (={})".format(
+                len(self.feature_names), len(self.feature_sizes))
+
+        num_frames = -1  # the number of frames in the video
+        feature_matrices = [None] * num_features  # an array of different features
+
+        for feature_index in range(num_features):
+            feature_matrix, num_frames_in_this_feature = self.get_video_matrix(
+                features[self.feature_names[feature_index]],
+                self.feature_sizes[feature_index], self.max_frames,
+                max_quantized_value, min_quantized_value)
+            if num_frames == -1:
+                num_frames = num_frames_in_this_feature
+            #else:
+            #    tf.assert_equal(num_frames, num_frames_in_this_feature)
+
+            feature_matrices[feature_index] = feature_matrix
+
+        # cap the number of frames at self.max_frames
+        num_frames = tf.minimum(num_frames, self.max_frames)
+
+        # split out the different features: rgb first, then audio
+        video_matrix = feature_matrices[0]
+        audio_matrix = feature_matrices[1]
+
+        return contexts["id"], video_matrix, audio_matrix, labels, num_frames
+
+
+def main(files_pattern):
+    data_files = gfile.Glob(files_pattern)
+    filename_queue = tf.train.string_input_producer(
+        data_files, num_epochs=1, shuffle=False)
+
+    reader = YT8MFrameFeatureReader(
+        feature_sizes=[1024, 128], feature_names=["rgb", "audio"])
+    vals = reader.prepare_reader(filename_queue)
+
+    with tf.Session() as sess:
+        sess.run(tf.initialize_local_variables())
+        sess.run(tf.initialize_all_variables())
+        coord = tf.train.Coordinator()
+        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
+
+        vid_num = 0
+        all_data = []
+        try:
+            while not coord.should_stop():
+                vid, features, audios, labels, nframes = sess.run(vals)
+                label_index = np.where(labels)[0].tolist()
+                vid_num += 1
+
+                #print(vid, features.shape, audios.shape, label_index, nframes)
+
+                features_int = features.astype(np.uint8)
+                audios_int = audios.astype(np.uint8)
+
+                value_dict = {}
+                value_dict['video'] = vid
+                value_dict['feature'] = features_int
+                value_dict['audio'] = audios_int
+                value_dict['label'] = label_index
+                value_dict['nframes'] = nframes
+                all_data.append(value_dict)
+
+        except tf.errors.OutOfRangeError:
+            print('Finished extracting.')
+
+        finally:
+            coord.request_stop()
+            coord.join(threads)
+
+    print(vid_num)
+
+    record_name = files_pattern.split('/')[-1].split('.')[0]
+    outputdir = target_dir
+    fn = '%s.pkl' % record_name
+    outp = open(os.path.join(outputdir, fn), 'wb')
+    cPickle.dump(all_data, outp, protocol=cPickle.HIGHEST_PROTOCOL)
+    outp.close()
+
+
+if __name__ == '__main__':
+    record_dir = source_dir
+    record_files = os.listdir(record_dir)
+    for f in record_files:
+        record_path = os.path.join(record_dir, f)
+        main(record_path)
diff --git a/fluid/PaddleCV/video/dataset/youtube8m/yt8m_pca/eigenvals.npy b/fluid/PaddleCV/video/dataset/youtube8m/yt8m_pca/eigenvals.npy
new file mode 100644
index 0000000000000000000000000000000000000000..632506b9ad68f030d64643cc8100868b21c3eb98
Binary files /dev/null and b/fluid/PaddleCV/video/dataset/youtube8m/yt8m_pca/eigenvals.npy differ
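
A kinetics pkl written by video2pkl.py above can be sanity-checked with a few lines of Python. This is a minimal sketch, not part of the patch: it assumes the (vid, label, ims) tuple layout from the cPickle.dump call in video2pkl.py, a Python 2 interpreter (cPickle, as in the scripts above), and a pkl path passed on the command line.

import sys
import cPickle

# Load one pkl produced by video2pkl.py; the tuple layout
# (video_id, label, jpeg_frames) follows the cPickle.dump call above.
f = open(sys.argv[1], 'rb')
vid, label, ims = cPickle.load(f)
f.close()

print('video id: {}'.format(vid))
print('label:    {}'.format(label))
print('frames:   {}'.format(len(ims)))  # each entry holds raw jpeg bytes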
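
Likewise, the youtube8m pkl files store the features as uint8, so a consumer would map them back to floats with the same formula as Dequantize in tf2pkl.py. A minimal sketch under those assumptions; the value_dict keys come from tf2pkl.py above, while the file name and the dequantize helper are hypothetical.

import cPickle
import numpy as np

def dequantize(feat, max_q=2.0, min_q=-2.0):
    # Inverse of the YT8M byte quantization, mirroring Dequantize() above.
    scalar = (max_q - min_q) / 255.0
    bias = (max_q - min_q) / 512.0 + min_q
    return feat.astype(np.float32) * scalar + bias

f = open('train0093.pkl', 'rb')  # hypothetical pkl written by tf2pkl.py
records = cPickle.load(f)
f.close()

rec = records[0]
rgb = dequantize(rec['feature'][:rec['nframes']])  # shape (nframes, 1024)
audio = dequantize(rec['audio'][:rec['nframes']])  # shape (nframes, 128)
print(rgb.shape, audio.shape, rec['label'])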