From e34d18a51393fdfb4ae749ec64ca3c26fa1c375d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 13 Mar 2017 15:45:08 -0800
Subject: [PATCH] Add support for FixedLenSequenceFeature for
 parse_example/parse_single_example. With this you may parse a variable-length
 feature of an Example into a padded Tensor. Change: 150009250

---
 .../python/kernel_tests/parsing_ops_test.py   | 118 +++++++++++++++---
 tensorflow/python/ops/parsing_ops.py          |  70 +++++++++--
 2 files changed, 163 insertions(+), 25 deletions(-)

diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index f99dd510cf8..741bc3f392e 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
@@ -735,17 +736,50 @@ class ParseExampleTest(test.TestCase):
         "serialized": ops.convert_to_tensor(serialized),
         "features": {
             aname:
-                parsing_ops.FixedLenFeature((None, 2, 1), dtype=dtypes.float32),
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True),
             bname:
-                parsing_ops.FixedLenFeature(
-                    (None, 1, 1, 1), dtype=dtypes.string),
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
             cname:
-                parsing_ops.FixedLenFeature((None,), dtype=dtypes.int64),
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=True),
             dname:
-                parsing_ops.FixedLenFeature((None,), dtype=dtypes.string),
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
         }
     }, expected_output)
 
+    # Test with padding values.
+    expected_output_custom_padding = dict(expected_output)
+    expected_output_custom_padding[aname] = np.array(
+        [
+            [-2, -2, -2, -2],
+            [1, 1, -2, -2],
+            [-1, -1, 2, 2],
+            [-2, -2, -2, -2],
+        ],
+        dtype=np.float32).reshape(4, 2, 2, 1)
+    self._test({
+        "example_names": example_names,
+        "serialized": ops.convert_to_tensor(serialized),
+        "features": {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True,
+                    default_value=-2.0),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+            cname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=True),
+            dname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
+        }
+    }, expected_output_custom_padding)
+
     # Change number of required values so the inputs are not a
     # multiple of this size.
     self._test(
@@ -754,17 +788,34 @@ class ParseExampleTest(test.TestCase):
             "serialized": ops.convert_to_tensor(serialized),
             "features": {
                 aname:
-                    parsing_ops.FixedLenFeature(
-                        (None, 2, 1), dtype=dtypes.float32),
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1), dtype=dtypes.float32, allow_missing=True),
                 bname:
-                    parsing_ops.FixedLenFeature(
-                        (None, 2, 1, 1), dtype=dtypes.string),
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
             }
         },
         expected_err=(
             errors_impl.OpError, "Name: in3, Key: b, Index: 2.  "
             "Number of bytes values is not a multiple of stride length."))
 
+    self._test(
+        {
+            "example_names": example_names,
+            "serialized": ops.convert_to_tensor(serialized),
+            "features": {
+                aname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1), dtype=dtypes.float32, allow_missing=True,
+                        default_value=[]),
+                bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        expected_err=(ValueError,
+                      "Cannot reshape a tensor with 0 elements to shape"))
+
     self._test(
         {
             "example_names": example_names,
@@ -772,14 +823,50 @@ class ParseExampleTest(test.TestCase):
             "features": {
                 aname:
                     parsing_ops.FixedLenFeature(
-                        (None, 2, 1), dtype=dtypes.float32, default_value=[]),
+                        (None, 2, 1), dtype=dtypes.float32),
                 bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        expected_err=(ValueError,
+                      "First dimension of shape for feature a unknown. "
+                      "Consider using FixedLenSequenceFeature."))
+
+    self._test(
+        {
+            "example_names": example_names,
+            "serialized": ops.convert_to_tensor(serialized),
+            "features": {
+                cname:
                     parsing_ops.FixedLenFeature(
-                        (None, 2, 1, 1), dtype=dtypes.string),
+                        (1, None), dtype=dtypes.int64, default_value=[[1]]),
             }
         },
         expected_err=(ValueError,
-                      "Cannot reshape a tensor with 0 elements to shape"))
+                      "All dimensions of shape for feature c need to be known "
+                      r"but received \(1, None\)."))
+
+    self._test({
+        "example_names": example_names,
+        "serialized": ops.convert_to_tensor(serialized),
+        "features": {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+            cname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=False),
+            dname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
+        }
+    }, expected_err=(ValueError,
+                     "Unsupported: FixedLenSequenceFeature requires "
+                     "allow_missing to be True."))
 
 
 class ParseSingleExampleTest(test.TestCase):
@@ -801,7 +888,8 @@ class ParseSingleExampleTest(test.TestCase):
       # Check shapes.
       for k, f in kwargs["features"].items():
         if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
-          self.assertEqual(tuple(out[k].get_shape()), f.shape)
+          self.assertEqual(tuple(out[k].get_shape()),
+                           tensor_shape.as_shape(f.shape))
         elif isinstance(f, parsing_ops.VarLenFeature):
           self.assertEqual(
               tuple(out[k].indices.get_shape().as_list()), (None, 1))
@@ -864,7 +952,7 @@ class ParseSingleExampleTest(test.TestCase):
                         (3, 3), dtypes.string, default_value=b_default),
                 # Feature "c" must be provided, since it has no default_value.
                 "c":
-                    parsing_ops.FixedLenFeature((2,), dtypes.float32),
+                    parsing_ops.FixedLenFeature(2, dtypes.float32),
             }
         },
         expected_output)
@@ -1040,7 +1128,7 @@ class ParseSequenceExampleTest(test.TestCase):
                 "b":
                     parsing_ops.FixedLenSequenceFeature((2, 2), dtypes.string),
                 "c":
-                    parsing_ops.FixedLenSequenceFeature((2,), dtypes.float32),
+                    parsing_ops.FixedLenSequenceFeature(2, dtypes.float32),
                 "d":
                     parsing_ops.FixedLenSequenceFeature(
                         (5,), dtypes.float32, allow_missing=True),
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 7c19457cbd4..b9ec50642ac 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -135,29 +135,41 @@ class FixedLenFeature(collections.namedtuple(
     shape: Shape of input data.
     dtype: Data type of input.
     default_value: Value to be used if an example is missing this feature. It
-        must be compatible with `dtype`.
+        must be compatible with `dtype` and of the specified `shape`.
   """
   pass
 FixedLenFeature.__new__.__defaults__ = (None,)
 
 
-# NOTE: If we ever support a default_value for sequence dense features, we can
-# remove this class and use FixedLenFeature in its place.
 class FixedLenSequenceFeature(collections.namedtuple(
-    "FixedLenSequenceFeature", ["shape", "dtype", "allow_missing"])):
-  """Configuration for a dense input feature in a sequence item.
+    "FixedLenSequenceFeature",
+    ["shape", "dtype", "allow_missing", "default_value"])):
+  """Configuration for parsing a variable-length input feature into a `Tensor`.
+
+  The `Tensor` resulting from parsing a single `SequenceExample` or `Example`
+  has a static `shape` of `[None] + shape` and the specified `dtype`.
+  The `Tensor` resulting from parsing a batch of `batch_size` `Example`s has
+  a static `shape` of `[batch_size, None] + shape` and the specified `dtype`.
+  Entries in the batch that come from different `Example`s are padded with
+  `default_value` to the maximum length present in the batch.
 
   To treat a sparse input as dense, provide `allow_missing=True`; otherwise,
   the parse functions will fail on any examples missing this feature.
 
   Fields:
-    shape: Shape of input data.
+    shape: Shape of input data for dimension 2 and higher. First dimension is
+      of variable length `None`.
     dtype: Data type of input.
     allow_missing: Whether to allow this feature to be missing from a feature
-      list item.
+      list item. `False` is supported only when parsing `SequenceExample`s;
+      parsing `Example`s requires `allow_missing=True`.
+    default_value: Scalar value to be used to pad multiple `Example`s to their
+      maximum length. Irrelevant for parsing a single `Example` or
+      `SequenceExample`. Defaults to "" for dtype string and 0 otherwise
+      (optional).
   """
   pass
-FixedLenSequenceFeature.__new__.__defaults__ = (False,)
+FixedLenSequenceFeature.__new__.__defaults__ = (False, None)
 
 
 def _features_to_raw_params(features, types):
@@ -236,6 +248,15 @@ def _features_to_raw_params(features, types):
           raise ValueError("Missing type for feature %s." % key)
         if feature.shape is None:
           raise ValueError("Missing shape for feature %s." % key)
+        feature_tensor_shape = tensor_shape.as_shape(feature.shape)
+        if (feature.shape and feature_tensor_shape.ndims and
+            feature_tensor_shape.dims[0].value is None):
+          raise ValueError("First dimension of shape for feature %s unknown. "
+                           "Consider using FixedLenSequenceFeature." % key)
+        if (feature.shape is not None and
+            not feature_tensor_shape.is_fully_defined()):
+          raise ValueError("All dimensions of shape for feature %s need to be "
+                           "known but received %s." % (key, str(feature.shape)))
         dense_keys.append(key)
         dense_shapes.append(feature.shape)
         dense_types.append(feature.dtype)
@@ -253,6 +274,8 @@ def _features_to_raw_params(features, types):
         dense_types.append(feature.dtype)
         if feature.allow_missing:
           dense_defaults[key] = None
+        if feature.default_value is not None:
+          dense_defaults[key] = feature.default_value
       else:
         raise ValueError("Invalid feature %s:%s." % (key, feature))
   return (
@@ -346,9 +369,15 @@ def parse_example(serialized, features, name=None, example_names=None):
   value, we will fail if that `Feature` is missing from any example in
   `serialized`.
 
+  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
+  (or `tf.float32` if not specified) and shape
+  `(serialized.size(), None) + df.shape`.
+  All examples in `serialized` will be padded with `default_value` along the
+  second dimension.
+
   Examples:
 
-  For example, if one expects a `tf.float32` VarlenFeature `ft` and three
+  For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three
   serialized `Example`s are provided:
 
   ```
@@ -370,6 +399,13 @@ def parse_example(serialized, features, name=None, example_names=None):
                       dense_shape=(3, 2)) }
   ```
 
+  If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and
+  `shape=[]` is used then the output will look like:
+
+  ```
+  {"ft": [[1.0, 2.0], [3.0, -1.0]]}
+  ```
+
   Given two `Example` input protos in `serialized`:
 
   ```
@@ -505,9 +541,23 @@ def parse_example(serialized, features, name=None, example_names=None):
   """
   if not features:
     raise ValueError("Missing: features was %s." % features)
+  if features:
+    modified_features = dict(features)  # Create a copy to modify
+    for key, feature in features.items():
+      if isinstance(feature, FixedLenSequenceFeature):
+        if not feature.allow_missing:
+          raise ValueError("Unsupported: FixedLenSequenceFeature requires "
+                           "allow_missing to be True.")
+        modified_features[key] = FixedLenSequenceFeature(
+            [None] + list(feature.shape),
+            feature.dtype,
+            feature.allow_missing,
+            feature.default_value)
+    features = modified_features
   (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
    dense_shapes) = _features_to_raw_params(
-       features, [VarLenFeature, SparseFeature, FixedLenFeature])
+       features,
+       [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature])
   outputs = _parse_example_raw(
       serialized, example_names, sparse_keys, sparse_types, dense_keys,
       dense_types, dense_defaults, dense_shapes, name)
-- 
GitLab