Add PaddleCV/video/dataset (#1743)

dbc24a3b · qingqing01 · GitHub · 68ad52d4 · dbc24a3b · dbc24a3b
7 changed file
--- a/fluid/PaddleCV/video/.gitignore
+++ b/fluid/PaddleCV/video/.gitignore
-dataset
 checkpoints
 output*
 *.pyc

--- a/fluid/PaddleCV/video/dataset/kinetics/README.md
+++ b/fluid/PaddleCV/video/dataset/kinetics/README.md
+1. Download kinetics-400_train.csv and kinetics-400_val.csv.
+2. ffmpeg is required to decode the mp4 videos.
+3. Convert each mp4 video to a pkl file; each pkl stores the tuple (video_id, label, images):
+   python generate_label.py kinetics-400_train.csv kinetics400_label.txt # generate label file
+   python video2pkl.py kinetics-400_train.csv $Source_dir $Target_dir $NUM_THREADS
--- a/fluid/PaddleCV/video/dataset/kinetics/generate_label.py
+++ b/fluid/PaddleCV/video/dataset/kinetics/generate_label.py
+import sys
+
+# kinetics-400_train.csv should be downloaded first and passed as sys.argv[1]
+# sys.argv[2] is the output label file, e.g. kinetics400_label.txt
+# python generate_label.py kinetics-400_train.csv kinetics400_label.txt
+
+num_classes = 400
+
+fname = sys.argv[1]
+outname = sys.argv[2]
+fl = open(fname).readlines()
+fl = fl[1:]
+outf = open(outname, 'w')
+
+label_list = []
+for line in fl:
+    label = line.strip().split(',')[0].strip('"')
+    if label in label_list:
+        continue
+    else:
+        label_list.append(label)
+
+assert len(label_list
+           ) == num_classes, "there should be {} labels in list, but ".format(
+               num_classes, len(label_list))
+
+label_list.sort()
+for i in range(num_classes):
+    outf.write('{} {}'.format(label_list[i], i) + '\n')
+
+outf.close()
--- a/fluid/PaddleCV/video/dataset/kinetics/video2pkl.py
+++ b/fluid/PaddleCV/video/dataset/kinetics/video2pkl.py
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import os
+import sys
+import glob
+import cPickle
+from multiprocessing import Pool
+
+# example command line:
+#   python video2pkl.py kinetics-400_train.csv $Source_dir $Target_dir $NUM_THREADS
+#
+# kinetics-400_train.csv is the training set file of K400 official release
+# each line contains label,youtube_id,time_start,time_end,split,is_cc
+
+assert (len(sys.argv) == 5)
+
+f = open(sys.argv[1])
+source_dir = sys.argv[2]
+target_dir = sys.argv[3]
+num_threads = sys.argv[4]
+all_video_entries = [x.strip().split(',') for x in f.readlines()]
+all_video_entries = all_video_entries[1:]
+f.close()
+
+category_label_map = {}
+f = open('kinetics400_label.txt')
+for line in f:
+    ens = line.strip().split(' ')
+    category = " ".join(ens[0:-1])
+    label = int(ens[-1])
+    category_label_map[category] = label
+f.close()
+
+
+def generate_pkl(entry):
+    mode = entry[4]
+    category = entry[0].strip('"')
+    category_dir = category
+    video_path = os.path.join(
+        './',
+        entry[1] + "_%06d" % int(entry[2]) + "_%06d" % int(entry[3]) + ".mp4")
+    video_path = os.path.join(source_dir, category_dir, video_path)
+    label = category_label_map[category]
+
+    vid = './' + video_path.split('/')[-1].split('.')[0]
+    if os.path.exists(video_path):
+        if not os.path.exists(vid):
+            os.makedirs(vid)
+        os.system('ffmpeg -i ' + video_path + ' -q 0 ' + vid + '/%06d.jpg')
+    else:
+        print("File not exists {}".format(video_path))
+        return
+
+    images = sorted(glob.glob(vid + '/*.jpg'))
+    ims = []
+    for img in images:
+        f = open(img)
+        ims.append(f.read())
+        f.close()
+
+    output_pkl = vid + ".pkl"
+    output_pkl = os.path.join(target_dir, output_pkl)
+    f = open(output_pkl, 'w')
+    cPickle.dump((vid, label, ims), f, -1)
+    f.close()
+
+    os.system('rm -rf %s' % vid)
+
+
+pool = Pool(processes=int(sys.argv[4]))
+pool.map(generate_pkl, all_video_entries)
+pool.close()
+pool.join()
--- a/fluid/PaddleCV/video/dataset/youtube8m/README.md
+++ b/fluid/PaddleCV/video/dataset/youtube8m/README.md
+1. TensorFlow is required to read the tfrecord files.
+2. Run `python tf2pkl.py $Source_dir $Target_dir` to convert every tfrecord file in $Source_dir into a pkl file in $Target_dir.
--- a/fluid/PaddleCV/video/dataset/youtube8m/tf2pkl.py
+++ b/fluid/PaddleCV/video/dataset/youtube8m/tf2pkl.py
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+"""Provides readers configured for different datasets."""
+import os, sys
+import numpy as np
+import tensorflow as tf
+from tensorflow import logging
+import cPickle
+
+from tensorflow.python.platform import gfile
+
+assert (len(sys.argv) == 3)
+source_dir = sys.argv[1]
+target_dir = sys.argv[2]
+
+
+def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
+    """Dequantize the feature from the byte format to the float format.
+
+    Args:
+    feat_vector: the input 1-d vector.
+    max_quantized_value: the maximum of the quantized value.
+    min_quantized_value: the minimum of the quantized value.
+
+    Returns:
+    A float vector which has the same shape as feat_vector.
+    """
+    assert max_quantized_value > min_quantized_value
+    quantized_range = max_quantized_value - min_quantized_value
+    scalar = quantized_range / 255.0
+    bias = (quantized_range / 512.0) + min_quantized_value
+    return feat_vector * scalar + bias
+
+
+def resize_axis(tensor, axis, new_size, fill_value=0):
+    """Truncates or pads a tensor to new_size on on a given axis.
+
+    Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
+    size increases, the padding will be performed at the end, using fill_value.
+
+    Args:
+      tensor: The tensor to be resized.
+      axis: An integer representing the dimension to be sliced.
+      new_size: An integer or 0d tensor representing the new value for
+        tensor.shape[axis].
+      fill_value: Value to use to fill any new entries in the tensor. Will be
+        cast to the type of tensor.
+
+    Returns:
+      The resized tensor.
+    """
+    tensor = tf.convert_to_tensor(tensor)
+    shape = tf.unstack(tf.shape(tensor))
+
+    pad_shape = shape[:]
+    pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
+
+    shape[axis] = tf.minimum(shape[axis], new_size)
+    shape = tf.stack(shape)
+
+    resized = tf.concat([
+        tf.slice(tensor, tf.zeros_like(shape), shape),
+        tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
+    ], axis)
+
+    # Update shape.
+    new_shape = tensor.get_shape().as_list()  # A copy is being made.
+    new_shape[axis] = new_size
+    resized.set_shape(new_shape)
+    return resized
+
+
+class BaseReader(object):
+    """Inherit from this class when implementing new readers."""
+
+    def prepare_reader(self, unused_filename_queue):
+        """Create a thread for generating prediction and label tensors."""
+        raise NotImplementedError()
+
+
+class YT8MFrameFeatureReader(BaseReader):
+    """Reads TFRecords of SequenceExamples.
+
+    The TFRecords must contain SequenceExamples with the sparse in64 'labels'
+    context feature and a fixed length byte-quantized feature vector, obtained
+    from the features in 'feature_names'. The quantized features will be mapped
+    back into a range between min_quantized_value and max_quantized_value.
+    """
+
+    def __init__(self,
+                 num_classes=3862,
+                 feature_sizes=[1024],
+                 feature_names=["inc3"],
+                 max_frames=300):
+        """Construct a YT8MFrameFeatureReader.
+
+        Args:
+          num_classes: a positive integer for the number of classes.
+          feature_sizes: positive integer(s) for the feature dimensions as a list.
+          feature_names: the feature name(s) in the tensorflow record as a list.
+          max_frames: the maximum number of frames to process.
+        """
+
+        assert len(feature_names) == len(feature_sizes), \
+        "length of feature_names (={}) != length of feature_sizes (={})".format( \
+        len(feature_names), len(feature_sizes))
+
+        self.num_classes = num_classes
+        self.feature_sizes = feature_sizes
+        self.feature_names = feature_names
+        self.max_frames = max_frames
+
+    def get_video_matrix(self, features, feature_size, max_frames,
+                         max_quantized_value, min_quantized_value):
+        """Decodes features from an input string and quantizes it.
+
+        Args:
+          features: raw feature values
+          feature_size: length of each frame feature vector
+          max_frames: number of frames (rows) in the output feature_matrix
+          max_quantized_value: the maximum of the quantized value.
+          min_quantized_value: the minimum of the quantized value.
+
+        Returns:
+          feature_matrix: matrix of all frame-features
+          num_frames: number of frames in the sequence
+        """
+        decoded_features = tf.reshape(
+            tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
+            [-1, feature_size])
+
+        num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
+
+        feature_matrix = decoded_features
+
+        return feature_matrix, num_frames
+
+    def prepare_reader(self,
+                       filename_queue,
+                       max_quantized_value=2,
+                       min_quantized_value=-2):
+        """Creates a single reader thread for YouTube8M SequenceExamples.
+
+        Args:
+          filename_queue: A tensorflow queue of filename locations.
+          max_quantized_value: the maximum of the quantized value.
+          min_quantized_value: the minimum of the quantized value.
+
+        Returns:
+          A tuple of video indexes, video features, labels, and padding data.
+        """
+        reader = tf.TFRecordReader()
+        _, serialized_example = reader.read(filename_queue)
+
+        contexts, features = tf.parse_single_sequence_example(
+            serialized_example,
+            context_features={
+                "id": tf.FixedLenFeature([], tf.string),
+                "labels": tf.VarLenFeature(tf.int64)
+            },
+            sequence_features={
+                feature_name: tf.FixedLenSequenceFeature(
+                    [], dtype=tf.string)
+                for feature_name in self.feature_names
+            })
+
+        # read ground truth labels
+        labels = (tf.cast(
+            tf.sparse_to_dense(
+                contexts["labels"].values, (self.num_classes, ),
+                1,
+                validate_indices=False),
+            tf.bool))
+
+        # loads (potentially) different types of features and concatenates them
+        num_features = len(self.feature_names)
+        assert num_features > 0, "No feature selected: feature_names is empty!"
+
+        assert len(self.feature_names) == len(self.feature_sizes), \
+        "length of feature_names (={}) != length of feature_sizes (={})".format( \
+        len(self.feature_names), len(self.feature_sizes))
+
+        num_frames = -1  # the number of frames in the video
+        feature_matrices = [None
+                            ] * num_features  # an array of different features
+
+        for feature_index in range(num_features):
+            feature_matrix, num_frames_in_this_feature = self.get_video_matrix(
+                features[self.feature_names[feature_index]],
+                self.feature_sizes[feature_index], self.max_frames,
+                max_quantized_value, min_quantized_value)
+            if num_frames == -1:
+                num_frames = num_frames_in_this_feature
+            #else:
+            #  tf.assert_equal(num_frames, num_frames_in_this_feature)
+
+            feature_matrices[feature_index] = feature_matrix
+
+        # cap the number of frames at self.max_frames
+        num_frames = tf.minimum(num_frames, self.max_frames)
+
+        # concatenate different features
+        video_matrix = feature_matrices[0]
+        audio_matrix = feature_matrices[1]
+
+        return contexts["id"], video_matrix, audio_matrix, labels, num_frames
+
+
+def main(files_pattern):
+    data_files = gfile.Glob(files_pattern)
+    filename_queue = tf.train.string_input_producer(
+        data_files, num_epochs=1, shuffle=False)
+
+    reader = YT8MFrameFeatureReader(
+        feature_sizes=[1024, 128], feature_names=["rgb", "audio"])
+    vals = reader.prepare_reader(filename_queue)
+
+    with tf.Session() as sess:
+        sess.run(tf.initialize_local_variables())
+        sess.run(tf.initialize_all_variables())
+        coord = tf.train.Coordinator()
+        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
+
+        vid_num = 0
+        all_data = []
+        try:
+            while not coord.should_stop():
+                vid, features, audios, labels, nframes = sess.run(vals)
+                label_index = np.where(labels == True)[0].tolist()
+                vid_num += 1
+
+                #print vid, features.shape, audios.shape, label_index, nframes
+
+                features_int = features.astype(np.uint8)
+                audios_int = audios.astype(np.uint8)
+
+                value_dict = {}
+                value_dict['video'] = vid
+                value_dict['feature'] = features_int
+                value_dict['audio'] = audios_int
+                value_dict['label'] = label_index
+                value_dict['nframes'] = nframes
+                all_data.append(value_dict)
+
+        except tf.errors.OutOfRangeError:
+            print('Finished extracting.')
+
+        finally:
+            coord.request_stop()
+            coord.join(threads)
+
+    print vid_num
+
+    record_name = files_pattern.split('/')[-1].split('.')[0]
+    outputdir = target_dir
+    fn = '%s.pkl' % record_name
+    outp = open(os.path.join(outputdir, fn), 'wb')
+    cPickle.dump(all_data, outp, protocol=cPickle.HIGHEST_PROTOCOL)
+    outp.close()
+
+
+if __name__ == '__main__':
+    record_dir = source_dir
+    record_files = os.listdir(record_dir)
+    for f in record_files:
+        record_path = os.path.join(record_dir, f)
+        main(record_path)
--- a/fluid/PaddleCV/video/dataset/youtube8m/yt8m_pca/eigenvals.npy
+++ b/fluid/PaddleCV/video/dataset/youtube8m/yt8m_pca/eigenvals.npy