From 71af72b1c2fe90b2dfe7af17ef74815f1920e566 Mon Sep 17 00:00:00 2001
From: zhoukunsheng
Date: Wed, 3 Jul 2019 10:46:13 +0800
Subject: [PATCH] upgrade hash op to support Tensor and LoDTensor input
 (#17998)

---
 paddle/fluid/API.spec                            |  2 +-
 paddle/fluid/operators/hash_op.cc                |  9 ++--
 paddle/fluid/operators/hash_op.h                 |  7 +--
 python/paddle/fluid/layers/nn.py                 | 35 +++++-------
 .../fluid/tests/unittests/test_hash_op.py        | 53 ++++++++++---------
 5 files changed, 48 insertions(+), 58 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 1ec3021bd3b..929f6e44d43 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -238,7 +238,7 @@ paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], va
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51'))
 paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
-paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'da621ba1363e8f5fe7b702526bbae18f'))
+paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5d16663e096d7f04954c70ce1cc5e195'))
 paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'e3993a477c94729526040ff65d95728e'))
 paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index 82222d0a7e7..6679c109b15 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -46,11 +46,10 @@ class HashOp : public framework::OperatorWithKernel {
 class HashOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddInput("X", "(Tensor) Input tensor of hash operator.");
+    AddOutput("Out", "(Tensor) Output tensor of hash operator.");
     AddComment(R"DOC(
-**Hash Operator**
-$$Out = scale * X$$
+Execute the xxHash algorithm `num_hash` times on each row (the second
+dimension) of the input.
 )DOC");
     AddAttr<int>("num_hash", "").SetDefault(1);
     AddAttr<int>("mod_by", "").SetDefault(100000);
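The `num_hash` and `mod_by` attributes declared above are populated from the Python
layer diffed further down. That nn.py hunk is cut off just before the `append_op`
call, so the following is only a hedged sketch of how the wrapper presumably
forwards its arguments; note in particular that the user-facing `hash_size`
argument would land in the op's `mod_by` attribute:

    # Hypothetical tail of layers.hash (not shown in the nn.py hunk below);
    # the user-facing hash_size is assumed to become the op's mod_by attribute.
    helper.append_op(
        type='hash',
        inputs={'X': input},
        outputs={'Out': out},
        attrs={'num_hash': num_hash, 'mod_by': hash_size})
    return out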
)DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index 9e7ad5235ff..14a4660aac7 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -47,10 +47,6 @@ class HashKernel : public framework::OpKernel { int num_hash = context.Attr("num_hash"); auto in_dims = in_t->dims(); - auto in_lod = in_t->lod(); - PADDLE_ENFORCE_EQ( - static_cast(in_dims[0]), in_lod[0].back(), - "The actual input data's size mismatched with LoD information."); std::vector out_dims; HashOutputSize(in_dims, out_dims, num_hash); @@ -67,6 +63,7 @@ class HashKernel : public framework::OpKernel { } input += last_dim; } + out_t->set_lod(in_t->lod()); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f861bc2d30c..a673458ca14 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10810,12 +10810,9 @@ def hash(input, hash_size, num_hash=1, name=None): Given: # shape [2, 2] - input.data = [ + input.data = [[1, 2], - [3, 4]], - ] - - input.lod = [[0, 2]] + [3, 4]] hash_size = 10000 @@ -10833,40 +10830,32 @@ def hash(input, hash_size, num_hash=1, name=None): [8310, 1327, 1654, 4567]], ] - output.lod = [[0, 2]] - Args: input (Variable): The input variable which is a one-hot word. The - dimensions of the input variable must be 2. + dimensions of the input variable must be 2. Both Tensor and LoDTensor are supported. hash_size (int): The space size for hash algorithm. The output value will keep in the range:math:`[0, hash_size - 1]`. num_hash (int): The times of hash, default 1. name (str, default None): The name of this layer. Returns: - Variable: The hash result variable which is a LoDTensor. + Variable: The hash result variable, which the same variable type as `input`. Examples: .. 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f861bc2d30c..a673458ca14 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -10810,12 +10810,9 @@ def hash(input, hash_size, num_hash=1, name=None):
     Given:
 
         # shape [2, 2]
-        input.data = [
-            [[1, 2],
-            [3, 4]],
-        ]
-
-        input.lod = [[0, 2]]
+        input.data = [[1, 2],
+                      [3, 4]]
 
         hash_size = 10000
@@ -10833,40 +10830,32 @@ def hash(input, hash_size, num_hash=1, name=None):
           [8310, 1327, 1654, 4567]],
         ]
 
-        output.lod = [[0, 2]]
-
     Args:
-        input (Variable): The input variable which is a one-hot word. The
-            dimensions of the input variable must be 2.
+        input (Variable): The input variable which is a one-hot word. The
+            dimensions of the input variable must be 2. Both Tensor and
+            LoDTensor are supported.
         hash_size (int): The space size for the hash algorithm. The output value
             will be in the range :math:`[0, hash_size - 1]`.
         num_hash (int): The number of hash rounds, default 1.
         name (str, default None): The name of this layer.
 
     Returns:
-        Variable: The hash result variable which is a LoDTensor.
+        Variable: The hash result variable, which has the same variable type
+            as `input`.
 
     Examples:
         .. code-block:: python
 
             import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            import numpy as np
-
-            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1)
-            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=1, hash_size=1000)
-            place = fluid.core.CPUPlace()
-            exece = fluid.Executor(place)
-            exece.run(fluid.default_startup_program())
-
-            # Init Tensor
-            tensor = fluid.core.LoDTensor()
-            tensor.set(np.random.randint(0, 10, (3, 1)).astype("int32"), place)
-            # Set LoD
-            tensor.set_recursive_sequence_lengths([[1, 1, 1]])
-            out = exece.run(feed={'titles': tensor}, fetch_list=[hash_r], return_numpy=False)
+
+            # titles has shape [batch, 1]
+            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=0)
+            # hash_r has shape [batch, 2]
+            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
+
+            # titles has shape [batch, 1] and lod information
+            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1)
+            # hash_r has shape [batch, 2] and inherits lod information from titles
+            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
     """
     helper = LayerHelper('hash', **locals())
     out = helper.create_variable_for_type_inference(
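Because the revised docstring drops the executor boilerplate the old example
carried, a hedged end-to-end run of the new plain-Tensor path may help.
Variable names are illustrative; the snippet assumes the Fluid 1.x executor
API current at the time of this PR:

    import numpy as np
    import paddle.fluid as fluid

    # Plain (lod_level=0) int32 input, hashed twice into [0, 1000).
    titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=0)
    hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # With lod_level=0, a NumPy array can be fed directly; no LoDTensor
    # wrapping (fluid.core.LoDTensor / set_recursive_sequence_lengths) is needed.
    data = np.random.randint(0, 10, (3, 1)).astype("int32")
    out, = exe.run(feed={'titles': data}, fetch_list=[hash_r])
    print(out)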
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
index 1130ea39c42..7b4e9bf738b 100644
--- a/python/paddle/fluid/tests/unittests/test_hash_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,36 +17,41 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestScaleOp(OpTest):
+class TestHashOp(OpTest):
     def setUp(self):
         self.op_type = "hash"
         self.init_test_case()
         self.inputs = {'X': (self.in_seq, self.lod)}
-        self.attrs = {'num_hash': 4, 'mod_by': 10000}
+        self.attrs = {'num_hash': 2, 'mod_by': 10000}
         self.outputs = {'Out': (self.out_seq, self.lod)}
 
     def init_test_case(self):
-        np.random.seed = 1
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        # self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
-        self.out_seq = [
-            [[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
-            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
-            [[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
-            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
-            [[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
-            [[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
-            [[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
-            [[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
-            [[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
-            [[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
-            [[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
-            [[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
-            [[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
-            [[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
-            [[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
-        ]
+        np.random.seed(1)
+        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
+        self.lod = [[2, 6]]
+        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
+                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
+                        [[6897], [3218]], [[9038], [7951]]]
+        self.out_seq = np.array(self.out_seq)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestHashNotLoDOp(TestHashOp):
+    def setUp(self):
+        self.op_type = "hash"
+        self.init_test_case()
+        self.inputs = {'X': self.in_seq}
+        self.attrs = {'num_hash': 2, 'mod_by': 10000}
+        self.outputs = {'Out': self.out_seq}
+
+    def init_test_case(self):
+        np.random.seed(1)
+        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
+        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
+                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
+                        [[6897], [3218]], [[9038], [7951]]]
         self.out_seq = np.array(self.out_seq)
 
     def test_check_output(self):
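The expected `out_seq` values in these tests are opaque constants. Under the
same seeding assumption as the kernel sketch earlier, they could in principle
be regenerated as follows (hedged: if the kernel's actual XXH64 seed
convention differs, the numbers will not match):

    import numpy as np
    import xxhash  # pip install xxhash

    np.random.seed(1)
    in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
    # Shape [8, 2, 1], mirroring self.out_seq in the tests above.
    regenerated = np.array([[[xxhash.xxh64_intdigest(row.tobytes(), seed=i) % 10000]
                             for i in range(2)]
                            for row in in_seq])
    print(regenerated)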