From bb01b120a61644f1a613468fe9b3af289d5353cf Mon Sep 17 00:00:00 2001
From: From00 <zero.ruibiao@gmail.com>
Date: Wed, 11 Aug 2021 11:43:40 +0800
Subject: [PATCH] [NPU] Support NPU kernel for TopKV2 op (#34599)

* Add NPU kernel for TopKV2 op

* deleted unnecessary cache file static_mode_white_list.cpython-37.pyc

* A draft for error checking

* A commit with accuracy error for float32 data

* Modify codes according to the review comments

* Modify codes according to the review comments
---
 paddle/fluid/operators/top_k_v2_op_npu.cc     |  94 +++++
 .../unittests/npu/test_top_k_v2_op_npu.py     | 343 ++++++++++++++++++
 2 files changed, 437 insertions(+)
 create mode 100755 paddle/fluid/operators/top_k_v2_op_npu.cc
 create mode 100755 python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
new file mode 100755
index 00000000000..e536055013f
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/top_k_v2_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel
+// may lead to large accuracy error for float32 data
+template <typename T>
+class TopkV2NPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* k_tensor = context.Input<Tensor>("K");
+    auto* out = context.Output<Tensor>("Out");
+    auto* indices = context.Output<Tensor>("Indices");  // type: INT64
+
+    int32_t k = static_cast<int32_t>(context.Attr<int>("k"));
+    int axis = static_cast<int>(context.Attr<int>("axis"));
+    const bool sorted = static_cast<bool>(context.Attr<bool>("sorted"));
+    const bool largest = static_cast<bool>(context.Attr<bool>("largest"));
+
+    if (axis < 0) {
+      axis += input->dims().size();
+    }
+
+    if (k_tensor != nullptr) {
+      std::vector<int> v_tmp(1);
+      TensorToVector(
+          *k_tensor,
+          context.template device_context<paddle::platform::NPUDeviceContext>(),
+          &v_tmp);
+      k = static_cast<int32_t>(v_tmp[0]);
+    }
+
+    framework::DDim output_dims = input->dims();
+    output_dims[axis] = k;
+
+    out->Resize(output_dims);
+    indices->Resize(output_dims);
+
+    out->mutable_data<T>(context.GetPlace());
+    indices->mutable_data<int64_t>(context.GetPlace());
+
+    framework::Tensor indices_int32(framework::proto::VarType::INT32);
+    indices_int32.Resize(output_dims);
+    indices_int32.mutable_data<int32_t>(context.GetPlace());
+
+    auto npu_stream =
+        context.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    NpuOpRunner npu_op_runner_topkv2;
+    npu_op_runner_topkv2.SetType("TopKV2")
+        .AddInput(*input)
+        .AddInput(std::vector<int32_t>{k})
+        .AddOutput(*out)
+        .AddOutput(indices_int32)
+        .AddAttr("sorted", sorted)
+        .AddAttr("dim", axis)
+        .AddAttr("largest", largest)
+        .Run(npu_stream);
+
+    // Cast 'indices_int32' to 'indices', from INT32 to INT64
+    auto dst_dtype = ConvertToNpuDtype(indices->type());
+    const auto& npu_op_runner_cast =
+        NpuOpRunner("Cast", {indices_int32}, {*indices},
+                    {{"dst_type", static_cast<int>(dst_dtype)}});
+    npu_op_runner_cast.Run(npu_stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_NPU_KERNEL(top_k_v2, ops::TopkV2NPUKernel<float>,
+                       ops::TopkV2NPUKernel<double>,
+                       ops::TopkV2NPUKernel<int32_t>,
+                       ops::TopkV2NPUKernel<int64_t>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
new file mode 100755
index 00000000000..a8242be855c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
@@ -0,0 +1,343 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid.core as core
+
+
+def numpy_topk(x, k=1, axis=-1, largest=True):
+    if axis < 0:
+        axis = len(x.shape) + axis
+    if largest:
+        indices = np.argsort(-x, axis=axis)
+    else:
+        indices = np.argsort(x, axis=axis)
+    if largest:
+        value = -np.sort(-x, axis=axis)
+    else:
+        value = np.sort(x, axis=axis)
+    indices = indices.take(indices=range(0, k), axis=axis)
+    value = value.take(indices=range(0, k), axis=axis)
+    return value, indices
+
+
+class TestTopkV2NPUOp(OpTest):
+    def setUp(self):
+        paddle.enable_static()
+        self.op_type = "top_k_v2"
+
+        self.set_npu()
+        self.set_dtype()
+        self.set_input_data()
+        self.set_attrs()
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def set_dtype(self):
+        self.dtype = np.int32
+
+    def set_attrs(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def set_input_data(self):
+        self.input_data = np.random.choice(
+            10000, size=(10, 20), replace=False).astype(self.dtype)
+
+    def test_check_output(self):
+        self.__class__.no_need_check_grad = True
+        if self.dtype == np.float32:
+            self.check_output_with_place(self.place, atol=1e-3)
+        else:
+            self.check_output_with_place(self.place)
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+
+class TestTopkV2OpFloat16(TestTopkV2NPUOp):
+    def set_attrs(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(3, 4).astype(self.dtype)
+
+
+class TestTopkV2OP1Int32(TestTopkV2NPUOp):
+    def set_attrs(self):
+        self.k = 3
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopkV2OP2Int32(TestTopkV2NPUOp):
+    def set_attrs(self):
+        self.k = 4
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopkV2OP3Int32(TestTopkV2NPUOp):
+    def set_attrs(self):
+        self.k = 6
+        self.axis = 1
+        self.largest = True
+
+
+class TestTopkV2OP4Int32(TestTopkV2NPUOp):
+    def set_attrs(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+
+class TestTopkV2Op1Int64(TestTopkV2OP1Int32):
+    def set_dtype(self):
+        self.dtype = np.int64
+
+
+class TestTopkV2Op2Int64(TestTopkV2OP2Int32):
+    def set_dtype(self):
+        self.dtype = np.int64
+
+
+class TestTopkV2Op3Int64(TestTopkV2OP3Int32):
+    def set_dtype(self):
+        self.dtype = np.int64
+
+
+class TestTopkV2Op4Int64(TestTopkV2OP4Int32):
+    def set_dtype(self):
+        self.dtype = np.int64
+
+
+class TestTopkV2Op1Float32(TestTopkV2OP1Int32):
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op2Float32(TestTopkV2OP2Int32):
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op3Float32(TestTopkV2OP3Int32):
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op4Float32(TestTopkV2OP4Int32):
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op1Float64(TestTopkV2OP1Int32):
+    def set_dtype(self):
+        self.dtype = np.float64
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op2Float64(TestTopkV2OP2Int32):
+    def set_dtype(self):
+        self.dtype = np.float64
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op3Float64(TestTopkV2OP3Int32):
+    def set_dtype(self):
+        self.dtype = np.float64
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopkV2Op4Float64(TestTopkV2OP4Int32):
+    def set_dtype(self):
+        self.dtype = np.float64
+
+    def set_input_data(self):
+        self.input_data = np.random.rand(10, 20).astype(self.dtype)
+
+
+class TestTopKAPI(unittest.TestCase):
+    def setUp(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+        np.random.seed(123)
+        self.input_data = np.random.rand(6, 7, 8)
+        self.large_input_data = np.random.rand(2, 1030)
+
+    def run_dygraph(self, place):
+        paddle.disable_static(place)
+        input_tensor = paddle.to_tensor(self.input_data)
+        large_input_tensor = paddle.to_tensor(self.large_input_data)
+        # test case for basic test case 1
+        paddle_result = paddle.topk(input_tensor, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+
+        # test case for basic test case 2 with axis
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 3 with tensor K
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+
+        # test case for basic test case 4 with tensor largest
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+
+        # test case for basic test case 5 with axis -1
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+
+        # test case for basic test case 6 for the partial sort 
+        paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
+        numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 7 for the unsorted 
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+        sort_paddle = numpy_topk(
+            np.array(paddle_result[0].numpy()), axis=1, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def run_static(self, place):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            input_tensor = paddle.static.data(
+                name="x", shape=[6, 7, 8], dtype="float64")
+            large_input_tensor = paddle.static.data(
+                name="large_x", shape=[2, 1030], dtype="float64")
+            k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
+            result1 = paddle.topk(input_tensor, k=2)
+            result2 = paddle.topk(input_tensor, k=2, axis=-1)
+            result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
+            self.assertEqual(result3[0].shape, (6, -1, 8))
+            self.assertEqual(result3[1].shape, (6, -1, 8))
+            result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+            result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+            result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
+            result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+            exe = paddle.static.Executor(place)
+            input_data = np.random.rand(10, 20).astype("float64")
+            large_input_data = np.random.rand(2, 100).astype("float64")
+            paddle_result = exe.run(
+                feed={
+                    "x": self.input_data,
+                    "large_x": self.large_input_data,
+                    "k": np.array([2]).astype("int32")
+                },
+                fetch_list=[
+                    result1[0], result1[1], result2[0], result2[1], result3[0],
+                    result3[1], result4[0], result4[1], result5[0], result5[1],
+                    result6[0], result6[1], result7[0], result7[1]
+                ])
+            numpy_result = numpy_topk(self.input_data, k=2)
+            self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
+
+            numpy_result = numpy_topk(self.input_data, k=2, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[2], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[3], numpy_result[1]))
+
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
+
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
+
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=-1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
+
+            numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[10], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[11], numpy_result[1]))
+            sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2)
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def test_cases(self):
+        places = [core.NPUPlace(0)]
+        for place in places:
+            self.run_dygraph(place)
+            self.run_static(place)
+
+    def test_errors(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+        paddle.disable_static()
+        x = paddle.to_tensor([1, 2, 3])
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=-1)
+
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=0)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab