Commit fd9fa066 authored by Mustafa Ispir, committed by TensorFlower Gardener

Renamed 'hashed_embedding_x' to 'scattered_embedding_x' to eliminate confusion.

Change: 139849495
Parent 97e39be4
@@ -89,7 +89,7 @@ Feature columns provide a mechanism to map data to a model.
 @@create_feature_spec_for_parsing
 @@crossed_column
 @@embedding_column
-@@hashed_embedding_column
+@@scattered_embedding_column
 @@input_from_feature_columns
 @@joint_weighted_sum_from_feature_columns
 @@make_place_holder_tensors_for_base_features
......
@@ -31,8 +31,10 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
-__all__ = ["safe_embedding_lookup_sparse", "hashed_embedding_lookup",
-           "hashed_embedding_lookup_sparse", "embedding_lookup_unique"]
+__all__ = [
+    "safe_embedding_lookup_sparse", "scattered_embedding_lookup",
+    "scattered_embedding_lookup_sparse", "embedding_lookup_unique"
+]
 
 
 def safe_embedding_lookup_sparse(embedding_weights,
@@ -176,7 +178,10 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   return sparse_ids, sparse_weights
 
 
-def hashed_embedding_lookup(params, values, dimension, name=None,
-                            hash_key=None):
+def scattered_embedding_lookup(params,
+                               values,
+                               dimension,
+                               name=None,
+                               hash_key=None):
   """Looks up embeddings using parameter hashing for each value in `values`.
@@ -223,7 +228,7 @@ def hashed_embedding_lookup(params, values, dimension, name=None,
   if not isinstance(params, list):
     params = [params]
 
-  with ops.name_scope(name, "hashed_embedding_lookup",
+  with ops.name_scope(name, "scattered_embedding_lookup",
                       params + [dimension, values]):
     if dimension <= 0:
       raise ValueError("Dimension should be >0 not %d" % dimension)
@@ -266,7 +271,7 @@ def hashed_embedding_lookup(params, values, dimension, name=None,
         0, [values_shape, [dimension]]))
 
 
-def hashed_embedding_lookup_sparse(params,
+def scattered_embedding_lookup_sparse(params,
                                    sparse_values,
                                    dimension,
                                    combiner=None,
@@ -275,7 +280,7 @@ def hashed_embedding_lookup_sparse(params,
                                    hash_key=None):
   """Looks up embeddings of a sparse feature using parameter hashing.
 
-  See `tf.contrib.layers.hashed_embedding_lookup` for embedding with hashing.
+  See `tf.contrib.layers.scattered_embedding_lookup` for embedding with hashing.
 
   Args:
     params: A `Tensor`, `list` of `Tensors`, or `PartitionedVariable`.
@@ -311,7 +316,7 @@ def hashed_embedding_lookup_sparse(params,
   if not isinstance(sparse_values, sparse_tensor.SparseTensor):
     raise TypeError("sparse_values must be SparseTensor")
 
-  with ops.name_scope(name, "hashed_sparse_embedding_lookup",
+  with ops.name_scope(name, "scattered_embedding_lookup_sparse",
                       params + [sparse_values]) as scope:
     # Fill in the empty rows.
     if default_value is None:
@@ -330,8 +335,8 @@ def hashed_embedding_lookup_sparse(params,
     values = sparse_values.values
     values, idx = array_ops.unique(values)
 
-    embeddings = hashed_embedding_lookup(params, values, dimension,
-                                         hash_key=hash_key)
+    embeddings = scattered_embedding_lookup(
+        params, values, dimension, hash_key=hash_key)
 
     if combiner == "sum":
       embeddings = math_ops.sparse_segment_sum(embeddings, idx, segment_ids,
......
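For context, a minimal sketch of calling the renamed dense lookup. The weight-pool shape and the feature values here are invented for illustration; `tf.contrib.layers.scattered_embedding_lookup` is the public alias introduced by this change:

```python
import tensorflow as tf

# Illustrative parameter pool: a 1-D float variable serving as `params`.
embedding_weights = tf.Variable(
    tf.truncated_normal([100], mean=0.0, stddev=0.1))

values = tf.constant(["foo", "bar", "foo"])

# Component i of each embedding is read from embedding_weights at an index
# derived from a fingerprint of the pair (value, i).
embeddings = tf.contrib.layers.scattered_embedding_lookup(
    embedding_weights, values, dimension=10)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(embeddings).shape)  # (3, 10); rows 0 and 2 ("foo") match
```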
@@ -261,7 +261,7 @@ class SafeEmbeddingLookupSparseTest(tf.test.TestCase):
           embedding_weights, sparse_ids, sparse_weights)
 
 
-class HashedEmbeddingLookupTest(tf.test.TestCase):
+class ScatteredEmbeddingLookupTest(tf.test.TestCase):
 
   def setUp(self):
     tf.set_random_seed(1)
@@ -281,24 +281,24 @@ class HashedEmbeddingLookupTest(tf.test.TestCase):
         w.initializer.run()
     return embedding_weights
 
-  def test_hashed_embedding_consistency(self):
+  def test_scattered_embedding_consistency(self):
     with self.test_session():
       embedding_weights = self._random_weights()
       values = tf.constant(["foo", "foo"])
 
-      embedding_lookup_result = tf.contrib.layers.hashed_embedding_lookup(
+      embedding_lookup_result = tf.contrib.layers.scattered_embedding_lookup(
           embedding_weights, values, dimension=10).eval()
 
       self.assertAllEqual(embedding_lookup_result.shape, [2, 10])
       self.assertAllEqual(embedding_lookup_result[0],
                           embedding_lookup_result[1])
 
-  def test_hashed_embedding_multiple_partition(self):
+  def test_scattered_embedding_multiple_partition(self):
     with self.test_session():
       embedding_weights = self._random_weights(num_shards=7)
       values = tf.constant([4, 4, 5])
 
-      embedding_lookup_result = tf.contrib.layers.hashed_embedding_lookup(
+      embedding_lookup_result = tf.contrib.layers.scattered_embedding_lookup(
          embedding_weights, values, dimension=5).eval()
 
       self.assertAllEqual(embedding_lookup_result.shape, [3, 5])
@@ -309,31 +309,31 @@ class HashedEmbeddingLookupTest(tf.test.TestCase):
                                  embedding_lookup_result[0]) ** 2)
       self.assertGreater(embedding_diff, 0)
 
-  def test_hashed_embedding_coverage(self):
+  def test_scattered_embedding_coverage(self):
     with self.test_session():
       size = 8
       embedding_weights = self._random_weights(size=size, num_shards=3)
       values = tf.constant(["foo"])
 
       # Large embedding dimension to cover the full range of weights.
-      embedding_lookup_result = tf.contrib.layers.hashed_embedding_lookup(
+      embedding_lookup_result = tf.contrib.layers.scattered_embedding_lookup(
          embedding_weights, values, dimension=100).eval()
 
       self.assertEqual(len(np.unique(embedding_lookup_result[0])), size)
 
-  def test_hashed_embedding_multi_dimension(self):
+  def test_scattered_embedding_multi_dimension(self):
     with self.test_session():
       embedding_weights = self._random_weights()
       values = tf.constant([["foo", "bar", "bar"], ["bar", "bar", "foo"]])
 
-      embedding_lookup_result = tf.contrib.layers.hashed_embedding_lookup(
+      embedding_lookup_result = tf.contrib.layers.scattered_embedding_lookup(
          embedding_weights, values, dimension=10).eval()
 
       self.assertAllEqual(embedding_lookup_result.shape, [2, 3, 10])
       self.assertAllEqual(embedding_lookup_result[0][0],
                           embedding_lookup_result[1][2])
 
-  def test_hashed_embedding_lookup_sparse(self):
+  def test_scattered_embedding_lookup_sparse(self):
     with self.test_session():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_tensor = tf.SparseTensor(values=["foo", "bar", "foo", "bar"],
@@ -341,7 +341,7 @@ class HashedEmbeddingLookupTest(tf.test.TestCase):
                                       shape=[5, 2])
 
       embedding_lookup_result = (
-          tf.contrib.layers.hashed_embedding_lookup_sparse(
+          tf.contrib.layers.scattered_embedding_lookup_sparse(
               embedding_weights, sparse_tensor, dimension=5, combiner="mean")
           .eval())
......
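The sparse variant can be exercised the same way; a hedged sketch mirroring the test above (tensor contents are illustrative, and note that this TF release still spells the dense-shape argument `shape=`):

```python
import tensorflow as tf

embedding_weights = tf.Variable(
    tf.truncated_normal([100], mean=0.0, stddev=0.1))

# Two feature values in row 0, one in row 1.
sparse_values = tf.SparseTensor(values=["foo", "bar", "foo"],
                                indices=[[0, 0], [0, 1], [1, 0]],
                                shape=[2, 2])

# One 5-dimensional embedding per row, averaging the per-value embeddings.
embeddings = tf.contrib.layers.scattered_embedding_lookup_sparse(
    embedding_weights, sparse_values, dimension=5, combiner="mean")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(embeddings).shape)  # (2, 5)
```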
@@ -1094,10 +1094,12 @@ def shared_embedding_columns(sparse_id_columns,
   return tuple(embedded_columns)
 
 
-class _HashedEmbeddingColumn(collections.namedtuple(
-    "_HashedEmbeddingColumn", ["column_name", "size", "dimension", "combiner",
-                               "initializer"]), _EmbeddingColumn):
-  """See `hashed_embedding_column`."""
+class _ScatteredEmbeddingColumn(
+    collections.namedtuple(
+        "_ScatteredEmbeddingColumn",
+        ["column_name", "size", "dimension", "combiner", "initializer"]),
+    _EmbeddingColumn):
+  """See `scattered_embedding_column`."""
 
   def __new__(cls,
               column_name,
@@ -1113,13 +1115,13 @@ class _HashedEmbeddingColumn(collections.namedtuple(
       # TODO(b/25671353): Better initial value?
       initializer = init_ops.truncated_normal_initializer(
           mean=0.0, stddev=stddev)
-    return super(_HashedEmbeddingColumn, cls).__new__(cls, column_name, size,
-                                                      dimension, combiner,
-                                                      initializer)
+    return super(_ScatteredEmbeddingColumn, cls).__new__(cls, column_name, size,
+                                                         dimension, combiner,
+                                                         initializer)
 
   @property
   def name(self):
-    return "{}_hashed_embedding".format(self.column_name)
+    return "{}_scattered_embedding".format(self.column_name)
 
   @property
   def config(self):
@@ -1141,7 +1143,7 @@ class _HashedEmbeddingColumn(collections.namedtuple(
         max_norm=None)
 
 
-def hashed_embedding_column(column_name,
+def scattered_embedding_column(column_name,
                                size,
                                dimension,
                                combiner=None,
@@ -1151,6 +1153,18 @@ def hashed_embedding_column(column_name,
   The i-th embedding component of a value v is found by retrieving an
   embedding weight whose index is a fingerprint of the pair (v,i).
 
+  An embedding column with sparse_column_with_hash_bucket such as
+
+      embedding_column(
+          sparse_column_with_hash_bucket(column_name, bucket_size),
+          dimension)
+
+  could be replaced by
+
+      scattered_embedding_column(
+          column_name, size=bucket_size * dimension, dimension=dimension)
+
+  for the same number of embedding parameters and hopefully reduced impact of
+  collisions with a cost of slowing down training.
+
   Args:
     column_name: A string defining sparse column name.
     size: An integer specifying the number of parameters in the embedding layer.
@@ -1167,7 +1181,7 @@ def hashed_embedding_column(column_name,
       `tf.truncated_normal_initializer` with mean 0 and standard deviation 0.1.
 
   Returns:
-    A _HashedEmbeddingColumn.
+    A _ScatteredEmbeddingColumn.
 
   Raises:
     ValueError: if dimension or size is not a positive integer; or if combiner
@@ -1188,7 +1202,7 @@ def hashed_embedding_column(column_name,
                      "combiner: {}, column_name: {}".format(combiner,
                                                             column_name))
 
-  return _HashedEmbeddingColumn(column_name, size, dimension, combiner,
-                                initializer)
+  return _ScatteredEmbeddingColumn(column_name, size, dimension, combiner,
+                                   initializer)
......
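The new docstring above claims an equivalence between the two column types; here is a sketch of both definitions under that reading (the feature name `wire` and the sizes are invented for illustration):

```python
import tensorflow as tf

bucket_size, dimension = 1000, 3

# Conventional route: hash values into buckets, then embed each bucket id.
embedded = tf.contrib.layers.embedding_column(
    tf.contrib.layers.sparse_column_with_hash_bucket(
        "wire", hash_bucket_size=bucket_size),
    dimension=dimension)

# Scattered route: the same bucket_size * dimension parameters, but each
# (value, i) pair is fingerprinted straight into the weight pool, trading
# some training speed for a hopefully smaller impact of collisions.
scattered = tf.contrib.layers.scattered_embedding_column(
    "wire", size=bucket_size * dimension, dimension=dimension,
    combiner="mean")
```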
@@ -75,7 +75,7 @@ def _embeddings_from_arguments(column,
           trainable=trainable,
           collections=weight_collections)
 
-    return embedding_ops.hashed_embedding_lookup_sparse(
+    return embedding_ops.scattered_embedding_lookup_sparse(
         embeddings, input_tensor, args.dimension,
         combiner=args.combiner, name='lookup')
@@ -256,9 +256,9 @@ def sequence_input_from_feature_columns(columns_to_tensors,
   See documentation for `input_from_feature_columns`. The following types of
   `FeatureColumn` are permitted in `feature_columns`: `_OneHotColumn`,
-  `_EmbeddingColumn`, `_HashedEmbeddingColumn`, `_RealValuedColumn`,
+  `_EmbeddingColumn`, `_ScatteredEmbeddingColumn`, `_RealValuedColumn`,
   `_DataFrameColumn`. In addition, columns in `feature_columns` may not be
-  constructed using any of the following: `HashedEmbeddingColumn`,
+  constructed using any of the following: `ScatteredEmbeddingColumn`,
   `BucketizedColumn`, `CrossedColumn`.
 
   Args:
@@ -892,7 +892,7 @@ _SUPPORTED_SEQUENCE_COLUMNS = (fc._OneHotColumn,
                                fc._EmbeddingColumn,
                                fc._RealValuedColumn)
 
-_FORBIDDEN_SEQUENCE_COLUMNS = (fc._HashedEmbeddingColumn,
+_FORBIDDEN_SEQUENCE_COLUMNS = (fc._ScatteredEmbeddingColumn,
                                fc._BucketizedColumn,
                                fc._CrossedColumn)
......
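Downstream, such a column feeds `input_from_feature_columns` like any other embedding column; a short sketch along the lines of the DNN test updated below (the sparse input here is invented):

```python
import tensorflow as tf

wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"],
                              indices=[[0, 0], [1, 0], [1, 1]],
                              shape=[2, 2])
features = {"wire": wire_tensor}

# Big enough hash space that collisions should be rare.
embedded_sparse = tf.contrib.layers.scattered_embedding_column(
    "wire", size=1000, dimension=3, combiner="mean")

# Produces a dense [batch_size, dimension] input layer, here [2, 3].
output = tf.contrib.layers.input_from_feature_columns(
    features, [embedded_sparse])
```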
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import numpy as np
 import tensorflow as tf
@@ -50,7 +49,6 @@ class TransformerTest(tf.test.TestCase):
     with self.test_session():
       self.assertAllEqual(output[bucket].eval(), [[2], [3], [0]])
 
-
   def testBucketizedColumnWithMultiDimensions(self):
     bucket = tf.contrib.layers.bucketized_column(
         tf.contrib.layers.real_valued_column("price", 2),
@@ -585,14 +583,15 @@ class CreateInputLayersForDNNsTest(tf.test.TestCase):
       tf.global_variables_initializer().run()
       self.assertAllEqual(output.eval().shape, [4, 10])
 
-  def testHashedEmbeddingColumnSucceedsForDNN(self):
+  def testScatteredEmbeddingColumnSucceedsForDNN(self):
     wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo", "omar"],
                                   indices=[[0, 0], [1, 0], [1, 1], [2, 0]],
                                   shape=[3, 2])
 
     features = {"wire": wire_tensor}
     # Big enough hash space so that hopefully there is no collision
-    embedded_sparse = tf.contrib.layers.hashed_embedding_column("wire", 1000, 3)
+    embedded_sparse = tf.contrib.layers.scattered_embedding_column("wire", 1000,
+                                                                   3)
     output = tf.contrib.layers.input_from_feature_columns(
         features, [embedded_sparse], weight_collections=["my_collection"])
     weights = tf.get_collection("my_collection")
@@ -2054,12 +2053,10 @@ class ParseExampleTest(tf.test.TestCase):
     self.assertAllEqual(output[wire_cast].indices.eval(), [[0, 0], [0, 1]])
     self.assertAllEqual(output[wire_cast].values.eval(), [2, 0])
 
   def testParseSequenceExample(self):
     location_keys = ["east_side", "west_side", "nyc"]
     embedding_dimension = 10
 
     location = tf.contrib.layers.sparse_column_with_keys(
         "location", keys=location_keys)
     location_onehot = tf.contrib.layers.one_hot_column(location)
@@ -2067,7 +2064,8 @@ class ParseExampleTest(tf.test.TestCase):
         "wire_cast", ["marlo", "omar", "stringer"])
     wire_cast_embedded = tf.contrib.layers.embedding_column(
         wire_cast, dimension=embedding_dimension)
-    measurements = tf.contrib.layers.real_valued_column("measurements", dimension=2)
+    measurements = tf.contrib.layers.real_valued_column(
+        "measurements", dimension=2)
 
     context_feature_columns = [location_onehot]
     sequence_feature_columns = [wire_cast_embedded, measurements]
@@ -2098,7 +2096,6 @@ class ParseExampleTest(tf.test.TestCase):
         ])
     }))
 
-
     ctx, seq = tf.contrib.layers.parse_feature_columns_from_sequence_examples(
         serialized=sequence_example.SerializeToString(),
         context_feature_columns=context_feature_columns,
@@ -2128,6 +2125,7 @@ class ParseExampleTest(tf.test.TestCase):
     self.assertAllClose(
         measurement_val, np.array([[0.2, 0.3], [0.1, 0.8], [0.5, 0.0]]))
 
+
 class InferRealValuedColumnTest(tf.test.TestCase):
 
   def testTensorInt32(self):
......