From e34d18a51393fdfb4ae749ec64ca3c26fa1c375d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 13 Mar 2017 15:45:08 -0800 Subject: [PATCH] Add support for FixedLenSequenceFeature for parse_example/parse_single_example. With this you may parse variable-length feature of the Example into a padded Tensor. Change: 150009250 --- .../python/kernel_tests/parsing_ops_test.py | 118 +++++++++++++++--- tensorflow/python/ops/parsing_ops.py | 70 +++++++++-- 2 files changed, 163 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py index f99dd510cf8..741bc3f392e 100644 --- a/tensorflow/python/kernel_tests/parsing_ops_test.py +++ b/tensorflow/python/kernel_tests/parsing_ops_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import parsing_ops @@ -735,17 +736,50 @@ class ParseExampleTest(test.TestCase): "serialized": ops.convert_to_tensor(serialized), "features": { aname: - parsing_ops.FixedLenFeature((None, 2, 1), dtype=dtypes.float32), + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), bname: - parsing_ops.FixedLenFeature( - (None, 1, 1, 1), dtype=dtypes.string), + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), cname: - parsing_ops.FixedLenFeature((None,), dtype=dtypes.int64), + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=True), dname: - parsing_ops.FixedLenFeature((None,), dtype=dtypes.string), + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), } }, expected_output) + # Test with padding values. + expected_output_custom_padding = dict(expected_output) + expected_output_custom_padding[aname] = np.array( + [ + [-2, -2, -2, -2], + [1, 1, -2, -2], + [-1, -1, 2, 2], + [-2, -2, -2, -2], + ], + dtype=np.float32).reshape(4, 2, 2, 1) + self._test({ + "example_names": example_names, + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True, + default_value=-2.0), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=True), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, expected_output_custom_padding) + # Change number of required values so the inputs are not a # multiple of this size. self._test( @@ -754,17 +788,34 @@ class ParseExampleTest(test.TestCase): "serialized": ops.convert_to_tensor(serialized), "features": { aname: - parsing_ops.FixedLenFeature( - (None, 2, 1), dtype=dtypes.float32), + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), bname: - parsing_ops.FixedLenFeature( - (None, 2, 1, 1), dtype=dtypes.string), + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), } }, expected_err=( errors_impl.OpError, "Name: in3, Key: b, Index: 2. " "Number of bytes values is not a multiple of stride length.")) + self._test( + { + "example_names": example_names, + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True, + default_value=[]), + bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "Cannot reshape a tensor with 0 elements to shape")) + self._test( { "example_names": example_names, @@ -772,14 +823,50 @@ class ParseExampleTest(test.TestCase): "features": { aname: parsing_ops.FixedLenFeature( - (None, 2, 1), dtype=dtypes.float32, default_value=[]), + (None, 2, 1), dtype=dtypes.float32), bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "First dimension of shape for feature a unknown. " + "Consider using FixedLenSequenceFeature.")) + + self._test( + { + "example_names": example_names, + "serialized": ops.convert_to_tensor(serialized), + "features": { + cname: parsing_ops.FixedLenFeature( - (None, 2, 1, 1), dtype=dtypes.string), + (1, None), dtype=dtypes.int64, default_value=[[1]]), } }, expected_err=(ValueError, - "Cannot reshape a tensor with 0 elements to shape")) + "All dimensions of shape for feature c need to be known " + r"but received \(1, None\).")) + + self._test({ + "example_names": example_names, + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=False), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, expected_err=(ValueError, + "Unsupported: FixedLenSequenceFeature requires " + "allow_missing to be True.")) class ParseSingleExampleTest(test.TestCase): @@ -801,7 +888,8 @@ class ParseSingleExampleTest(test.TestCase): # Check shapes. for k, f in kwargs["features"].items(): if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: - self.assertEqual(tuple(out[k].get_shape()), f.shape) + self.assertEqual(tuple(out[k].get_shape()), + tensor_shape.as_shape(f.shape)) elif isinstance(f, parsing_ops.VarLenFeature): self.assertEqual( tuple(out[k].indices.get_shape().as_list()), (None, 1)) @@ -864,7 +952,7 @@ class ParseSingleExampleTest(test.TestCase): (3, 3), dtypes.string, default_value=b_default), # Feature "c" must be provided, since it has no default_value. "c": - parsing_ops.FixedLenFeature((2,), dtypes.float32), + parsing_ops.FixedLenFeature(2, dtypes.float32), } }, expected_output) @@ -1040,7 +1128,7 @@ class ParseSequenceExampleTest(test.TestCase): "b": parsing_ops.FixedLenSequenceFeature((2, 2), dtypes.string), "c": - parsing_ops.FixedLenSequenceFeature((2,), dtypes.float32), + parsing_ops.FixedLenSequenceFeature(2, dtypes.float32), "d": parsing_ops.FixedLenSequenceFeature( (5,), dtypes.float32, allow_missing=True), diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index 7c19457cbd4..b9ec50642ac 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -135,29 +135,41 @@ class FixedLenFeature(collections.namedtuple( shape: Shape of input data. dtype: Data type of input. default_value: Value to be used if an example is missing this feature. It - must be compatible with `dtype`. + must be compatible with `dtype` and of the specified `shape`. """ pass FixedLenFeature.__new__.__defaults__ = (None,) -# NOTE: If we ever support a default_value for sequence dense features, we can -# remove this class and use FixedLenFeature in its place. class FixedLenSequenceFeature(collections.namedtuple( - "FixedLenSequenceFeature", ["shape", "dtype", "allow_missing"])): - """Configuration for a dense input feature in a sequence item. + "FixedLenSequenceFeature", + ["shape", "dtype", "allow_missing", "default_value"])): + """Configuration for parsing a variable-length input feature into a `Tensor`. + + The resulting `Tensor` of parsing a single `SequenceExample` or `Example` has + a static `shape` of `[None] + shape` and the specified `dtype`. + The resulting `Tensor` of parsing a `batch_size` many `Example`s has + a static `shape` of `[batch_size, None] + shape` and the specified `dtype`. + The entries in the `batch` from different `Examples` will be padded with + `default_value` to the maximum length present in the `batch`. To treat a sparse input as dense, provide `allow_missing=True`; otherwise, the parse functions will fail on any examples missing this feature. Fields: - shape: Shape of input data. + shape: Shape of input data for dimension 2 and higher. First dimension is + of variable length `None`. dtype: Data type of input. allow_missing: Whether to allow this feature to be missing from a feature - list item. + list item. Is available only for parsing `SequenceExample` not for + parsing `Examples`. + default_value: Scalar value to be used to pad multiple `Example`s to their + maximum length. Irrelevant for parsing a single `Example` or + `SequenceExample`. Defaults to "" for dtype string and 0 otherwise + (optional). """ pass -FixedLenSequenceFeature.__new__.__defaults__ = (False,) +FixedLenSequenceFeature.__new__.__defaults__ = (False, None) def _features_to_raw_params(features, types): @@ -236,6 +248,15 @@ def _features_to_raw_params(features, types): raise ValueError("Missing type for feature %s." % key) if feature.shape is None: raise ValueError("Missing shape for feature %s." % key) + feature_tensor_shape = tensor_shape.as_shape(feature.shape) + if (feature.shape and feature_tensor_shape.ndims and + feature_tensor_shape.dims[0].value is None): + raise ValueError("First dimension of shape for feature %s unknown. " + "Consider using FixedLenSequenceFeature." % key) + if (feature.shape is not None and + not feature_tensor_shape.is_fully_defined()): + raise ValueError("All dimensions of shape for feature %s need to be " + "known but received %s." % (key, str(feature.shape))) dense_keys.append(key) dense_shapes.append(feature.shape) dense_types.append(feature.dtype) @@ -253,6 +274,8 @@ def _features_to_raw_params(features, types): dense_types.append(feature.dtype) if feature.allow_missing: dense_defaults[key] = None + if feature.default_value is not None: + dense_defaults[key] = feature.default_value else: raise ValueError("Invalid feature %s:%s." % (key, feature)) return ( @@ -346,9 +369,15 @@ def parse_example(serialized, features, name=None, example_names=None): value, we will fail if that `Feature` is missing from any example in `serialized`. + Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type + (or `tf.float32` if not specified) and shape + `(serialized.size(), None) + df.shape`. + All examples in `serialized` will be padded with `default_value` along the + second dimension. + Examples: - For example, if one expects a `tf.float32` VarlenFeature `ft` and three + For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three serialized `Example`s are provided: ``` @@ -370,6 +399,13 @@ def parse_example(serialized, features, name=None, example_names=None): dense_shape=(3, 2)) } ``` + If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and + `shape=[]` is used then the output will look like: + + ``` + {"ft": [[1.0, 2.0], [3.0, -1.0]]} + ``` + Given two `Example` input protos in `serialized`: ``` @@ -505,9 +541,23 @@ def parse_example(serialized, features, name=None, example_names=None): """ if not features: raise ValueError("Missing: features was %s." % features) + if features: + modified_features = dict(features) # Create a copy to modify + for key, feature in features.items(): + if isinstance(feature, FixedLenSequenceFeature): + if not feature.allow_missing: + raise ValueError("Unsupported: FixedLenSequenceFeature requires " + "allow_missing to be True.") + modified_features[key] = FixedLenSequenceFeature( + [None] + list(feature.shape), + feature.dtype, + feature.allow_missing, + feature.default_value) + features = modified_features (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults, dense_shapes) = _features_to_raw_params( - features, [VarLenFeature, SparseFeature, FixedLenFeature]) + features, + [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature]) outputs = _parse_example_raw( serialized, example_names, sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults, dense_shapes, name) -- GitLab