From 206c44e2a8433c1011dee6363cae6f8928a5659f Mon Sep 17 00:00:00 2001
From: zhoukunsheng <zhoukunsheng@baidu.com>
Date: Wed, 3 Jul 2019 10:50:57 +0800
Subject: [PATCH] add unique kernel and op (#17557)

---
 paddle/fluid/API.spec                         |  1 +
 paddle/fluid/operators/unique_op.cc           | 61 ++++++++++++++
 paddle/fluid/operators/unique_op.h            | 83 +++++++++++++++++++
 python/paddle/fluid/layers/nn.py              | 40 +++++++++
 .../fluid/tests/unittests/test_unique.py      | 72 ++++++++++++++++
 5 files changed, 257 insertions(+)
 create mode 100644 paddle/fluid/operators/unique_op.cc
 create mode 100644 paddle/fluid/operators/unique_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_unique.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 929f6e44d4..3085c54bc3 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -202,6 +202,7 @@ paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=No
 paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '3f3abdb795a5c2aad8c2312249551ce5'))
 paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b0c4ca08d4eb295189e1b107c920d093'))
 paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b870fed41abd2aecf929ece65f555fa1'))
+paddle.fluid.layers.unique (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', 'cab0b06e5683875f12f0efc62fa230a9'))
 paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '33bc4f6010282ffe044d77be7ba7c275'))
 paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b992616c1afbd6b0c2a897ac23036381'))
 paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '463e4713806e5adaa4d20a41e2218453'))
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
new file mode 100644
index 0000000000..08ce81d75e
--- /dev/null
+++ b/paddle/fluid/operators/unique_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/unique_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for `unique`: validates the inputs/outputs and infers
+// the output shapes.
+class UniqueOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires X, Out and Index to be present and X to be rank-1.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UniqueOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UniqueOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Index"),
+                   "Output(Index) of UniqueOp should not be null.");
+
+    auto in_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(in_dims.size() == 1, "Input(X) should be a vector.");
+
+    // The number of distinct values is data dependent, so Out is declared
+    // with an unknown length (-1); the kernel resizes it when it runs.
+    ctx->SetOutputDim("Out", {-1});
+    // Index holds one entry per input element.
+    ctx->SetOutputDim("Index", in_dims);
+  }
+};
+
+// Declares the inputs, outputs and attributes of the `unique` operator.
+class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input tensor. It should be a 1-D tensor.");
+    AddAttr<int>("dtype",
+                 "Data type of Output(Index), as a VarType::Type value.");
+    AddOutput("Out", "A unique subsequence for input tensor.");
+    AddOutput("Index",
+              "An index tensor pointing to unique subsequence, which has "
+              "identical shape with input tensor and the data type given "
+              "by Attr(dtype).");
+    AddComment(R"DOC(
+    Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registered without a gradient op; CPU kernels are provided for float,
+// double, int32_t and int64_t inputs.
+REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
+REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
+                       ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
+                       ops::UniqueKernel<int64_t>);
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
new file mode 100644
index 0000000000..b6e4134791
--- /dev/null
+++ b/paddle/fluid/operators/unique_op.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+#include <cstring>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Computes the distinct values of a tensor in order of first appearance,
+// together with, for every input element, the position of its value in that
+// list. InT is the element type of the input; the index element type is
+// the template argument of apply().
+template <typename InT>
+struct UniqueOpFunctor {
+  framework::Tensor* out_;       // receives the distinct values
+  framework::Tensor* index_;     // receives one position per input element
+  const framework::Tensor* in_;  // tensor to deduplicate
+
+  UniqueOpFunctor(framework::Tensor* out, framework::Tensor* index,
+                  const framework::Tensor* in)
+      : out_(out), index_(index), in_(in) {}
+
+  // Fills index_ (as IndexT) and out_ (as InT) on the CPU. Intended to be
+  // invoked through framework::VisitDataType (see UniqueKernel), which is
+  // expected to pick IndexT from the op's dtype attribute.
+  template <typename IndexT>
+  void apply() const {
+    const int64_t numel = in_->numel();
+    auto* in_data = in_->data<InT>();
+    auto* index_data = index_->mutable_data<IndexT>(platform::CPUPlace());
+
+    // Position that the next previously unseen value will get in `uniq`.
+    int64_t next_pos = 0;
+
+    // TODO(fangzeyang): Should optimize performance here.
+    std::unordered_map<InT, int64_t> dict;  // value -> position in `uniq`
+    std::vector<InT> uniq;                  // values in first-seen order
+
+    PADDLE_ENFORCE(numel < pow(2, 31),
+                   "numel of Unique op input should less than INT_MAX");
+
+    // A 64-bit counter keeps the comparison against numel width-consistent.
+    for (int64_t i = 0; i < numel; i++) {
+      auto it = dict.find(in_data[i]);
+      if (it == dict.end()) {
+        dict.insert(std::make_pair(in_data[i], next_pos));
+        uniq.push_back(in_data[i]);
+        index_data[i] = static_cast<IndexT>(next_pos);
+        next_pos++;
+      } else {
+        index_data[i] = static_cast<IndexT>(it->second);
+      }
+    }
+
+    // The output length is only known now, so size Out before copying.
+    out_->Resize(framework::make_ddim({static_cast<int64_t>(uniq.size())}));
+    auto out_data = out_->mutable_data<InT>(platform::CPUPlace());
+    std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
+  }
+};
+
+// CPU kernel of the `unique` op. T is the element type of Input(X).
+template <typename T>
+class UniqueKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // The dtype attribute carries the Index element type as a VarType::Type.
+    auto data_type = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* index = context.Output<framework::Tensor>("Index");
+
+    // Dispatch on the index type; the functor performs the computation.
+    framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a673458ca1..ae441cde4f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -145,6 +145,7 @@ __all__ = [
     'pad2d',
     'unstack',
     'sequence_enumerate',
+    'unique',
     'expand',
     'sequence_concat',
     'scale',
@@ -12068,6 +12069,45 @@ def sign(x):
     return out
 
 
+def unique(x, dtype='int32'):
+    """
+    **unique** 
+
+    Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
+
+    Args:
+        x(Variable): A 1-D input tensor.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64.
+
+    Returns:
+        tuple: (out, index). `out` is the unique tensor for `x`, with identical dtype to `x`, and \
+            `index` is an index tensor pointing to `out`, by which user can recover the original `x` tensor.
+
+    Examples:
+        .. code-block:: python
+
+             import numpy as np
+             import paddle.fluid as fluid
+             x = fluid.assign(np.array([2, 3, 3, 1, 5, 3], dtype='int32'))
+             out, index = fluid.layers.unique(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1]
+    """
+    # NOTE(review): the example's fluid.assign is likely fluid.layers.assign.
+    helper = LayerHelper("unique", **locals())
+    # Out holds the distinct values, so it is created with the dtype of x.
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    # Index is created with the caller-chosen dtype ('int32' by default).
+    index = helper.create_variable_for_type_inference(dtype)
+    # dtype is also forwarded to the op as an attribute (converted below).
+    helper.append_op(
+        type='unique',
+        inputs={'X': x},
+        attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
+        outputs={'Out': [out],
+                 'Index': [index]})
+
+    return out, index
+
+
 def deformable_conv(input,
                     offset,
                     mask,
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
new file mode 100644
index 0000000000..2e91574954
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_unique.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestUniqueOp(OpTest):
+    """Checks the `unique` op on a small int64 vector with int32 indices."""
+
+    def setUp(self):
+        self.op_type = "unique"
+        self.init_config()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_config(self):
+        # Subclasses override this hook to supply other inputs/expectations.
+        x = np.array([2, 3, 3, 1, 5, 3], dtype='int64')
+        expected_out = np.array([2, 3, 1, 5], dtype='int64')
+        expected_index = np.array([0, 1, 1, 2, 3, 1], dtype='int32')
+
+        self.inputs = {'X': x}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
+        self.outputs = {'Out': expected_out, 'Index': expected_index}
+
+
+class TestOne(TestUniqueOp):
+    """Single-element input: the only value maps to position 0."""
+
+    def init_config(self):
+        self.inputs = {'X': np.array([2], dtype='int64')}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
+        self.outputs = {
+            'Out': np.array([2], dtype='int64'),
+            'Index': np.array([0], dtype='int32'),
+        }
+
+
+class TestRandom(TestUniqueOp):
+    """Random int64 input with int64 indices, checked against np.unique."""
+
+    def init_config(self):
+        x = np.random.randint(0, 100, (150, ), dtype='int64')
+        self.inputs = {'X': x}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)}
+
+        # np.unique returns sorted values; the expected output keeps the
+        # order of first appearance, so reorder by first-occurrence position.
+        sorted_uniq, first_pos, inverse = np.unique(
+            x, return_index=True, return_inverse=True)
+        order = np.argsort(first_pos)
+        target_out = sorted_uniq[order].astype('int64')
+
+        # rank[s] is the position in target_out of the s-th sorted value.
+        rank = np.empty_like(order)
+        rank[order] = np.arange(len(order))
+        target_index = rank[inverse].astype('int64')
+
+        self.outputs = {'Out': target_out, 'Index': target_index}
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab