From d3d1a6b6e0ac11f9b2facfa8fdd45a07b2097459 Mon Sep 17 00:00:00 2001
From: taixiurong
Date: Fri, 20 Nov 2020 13:10:09 +0800
Subject: [PATCH] add kunlun kernel: slice, slice_grad, top_k, cast. *test=kunlun (#28542)

* 1.add xpu slice op 2. add xpu top_k op 3.modify xpu cast to new api

* 1.add xpu slice op 2. add xpu top_k op 3.modify xpu cast to new api
---
 cmake/external/xpu.cmake                     |  2 +-
 paddle/fluid/operators/cast_op_xpu.cc        | 16 ++--
 .../{slice_xpu_op.cc => slice_op_xpu.cc}     | 34 ++++----
 paddle/fluid/operators/top_k_op_xpu.cc       | 82 +++++++++++++++++++
 .../tests/unittests/xpu/test_slice_op_xpu.py | 47 +++++++----
 .../tests/unittests/xpu/test_top_k_op_xpu.py | 77 +++++++++++++++++
 6 files changed, 219 insertions(+), 39 deletions(-)
 rename paddle/fluid/operators/{slice_xpu_op.cc => slice_op_xpu.cc} (88%)
 create mode 100644 paddle/fluid/operators/top_k_op_xpu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index eb00b82220..8d3fee915c 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -4,7 +4,7 @@ endif()
 
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_09_22_api_2020_11_05.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_10.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index 56160bd297..a2791cb262 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/cast_op.h"
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/platform/float16.h"
+#include "xpu/refactor/math.h"
 
 namespace paddle {
 namespace operators {
@@ -37,14 +39,16 @@ class CastXPUKernel : public framework::OpKernel {
     int r = -1;
     if (out_type == framework::proto::VarType::FP32) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else if (out_type == framework::proto::VarType::INT32) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else if (out_type == framework::proto::VarType::INT64) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data,
-                    numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else {
       PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
                                                  in_type, out_type));
@@ -63,7 +67,7 @@ class CastXPUKernel : public framework::OpKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
-    cast, ops::CastXPUKernel,
+    cast, ops::CastXPUKernel,
     ops::CastXPUKernel,
     ops::CastXPUKernel);
 #endif
diff --git a/paddle/fluid/operators/slice_xpu_op.cc b/paddle/fluid/operators/slice_op_xpu.cc
similarity index 88%
rename from paddle/fluid/operators/slice_xpu_op.cc
rename to paddle/fluid/operators/slice_op_xpu.cc
index 3d6f52c7dc..5f98efe8e9 100644
--- a/paddle/fluid/operators/slice_xpu_op.cc
+++ b/paddle/fluid/operators/slice_op_xpu.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-
+#include "paddle/fluid/operators/slice_op.h"
 #include
 #include
 #include
 #include
-#include "paddle/fluid/operators/slice_op.h"
+#include "xpu/refactor/math.h"
 
 namespace paddle {
 namespace operators {
@@ -85,10 +85,8 @@ class SliceXPUKernel : public framework::OpKernel {
     auto& dev_ctx = ctx.template device_context();
     auto* in_data = in->data();
     auto* out_data = out->mutable_data(ctx.GetPlace());
-
-    int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(),
-                               starts_extension.data(), ends_extension.data(),
-                               shape_size, in_data, out_data);
+    int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape,
+                       starts_extension, ends_extension);
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External("XPU slice kernel error!"));
   }
@@ -149,12 +147,14 @@ class SliceGradXPUKernel : public framework::OpKernel {
         shape_size > axes.size() ? starts_extension.data() : starts.data();
     int* ends_host = shape_size > axes.size() ?
                          ends_extension.data() : ends.data();
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&starts_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&ends_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&starts_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&ends_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
                  starts_device, platform::CPUPlace(), starts_host,
                  shape_size * sizeof(int));
@@ -168,9 +168,10 @@ class SliceGradXPUKernel : public framework::OpKernel {
       shape[i] = in_dims[i];
     }
     int* shape_device = nullptr;
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&shape_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&shape_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
                  shape_device, platform::CPUPlace(), shape.data(),
                  shape_size * sizeof(int));
@@ -196,7 +197,8 @@ class SliceGradXPUKernel : public framework::OpKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
-    slice, ops::SliceXPUKernel);
+    slice, ops::SliceXPUKernel,
+    ops::SliceXPUKernel);
 REGISTER_OP_XPU_KERNEL(
     slice_grad,
     ops::SliceGradXPUKernel);
diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc
new file mode 100644
index 0000000000..5e89e38c7d
--- /dev/null
+++ b/paddle/fluid/operators/top_k_op_xpu.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include
+
+#include "paddle/fluid/operators/top_k_op.h"
+#include "xpu/refactor/math.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template
+class TopkXPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Get the top k elements of each row of input tensor
+    auto* input = ctx.Input("X");
+    auto* output = ctx.Output("Out");
+    auto* indices = ctx.Output("Indices");
+
+    size_t k = static_cast(ctx.Attr("k"));
+    auto* k_t = ctx.Input("K");
+    if (k_t) {
+      k = k_t->data()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[output_dims.size() - 1] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    T* output_data = output->mutable_data(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data(ctx.GetPlace());
+    Tensor indices_32_data_tensor;
+    int32_t* indices_int_data = indices_32_data_tensor.mutable_data(
+        ctx.GetPlace(), indices->numel());
+    // reshape input to a flattern matrix(like flat_inner_dims)
+    framework::DDim inputdims = input->dims();
+    const size_t row = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t col = inputdims[inputdims.size() - 1];
+    auto& dev_ctx = ctx.template device_context();
+
+    int ret = xpu::sorted_topk(dev_ctx.x_context(), input->data(),
+                               output_data, indices_int_data, row, col, k);
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU API return wrong value[%d] in call kernel name "
+                          "[%s], please check "
+                          "where Baidu Kunlun Card is properly installed.",
+                          ret, "sorted_topk"));
+    ret = xpu::cast_v2(dev_ctx.x_context(),
+                       (const int32_t*)indices_int_data,
+                       indices_data, indices->numel());
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU API return wrong value[%d] in call kernel name "
+                          "[%s], please check "
+                          "where Baidu Kunlun Card is properly installed.",
+                          ret, "cast_v2"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel);
+#endif
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
index 44c8821be0..8f3578b526 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -12,21 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
-import unittest
+import paddle
 import numpy as np
 import sys
+import unittest
 sys.path.append("..")
-import paddle
-import paddle.fluid.core as core
 from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
+
+paddle.enable_static()
 
 
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp(OpTest):
     def setUp(self):
         self.op_type = "slice"
@@ -42,7 +41,7 @@ class TestSliceOp(OpTest):
         }
 
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [3, 3, 4]
         self.axes = [0, 1, 2]
@@ -58,9 +57,11 @@ class TestSliceOp(OpTest):
             self.check_grad_with_place(place, ['Input'], 'Out')
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestCase1(TestSliceOp):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
         self.ends = [3, 100, -1]
         self.axes = [0, 1, 2]
@@ -68,9 +69,11 @@ class TestCase1(TestSliceOp):
         self.out = self.input[-3:3, 0:100, 2:-1, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestCase2(TestSliceOp):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
         self.ends = [3, 100, -1]
         self.axes = [0, 1, 3]
@@ -79,6 +82,8 @@ class TestCase2(TestSliceOp):
 
 
 # 1.2 with attr(decrease)
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim(OpTest):
     def setUp(self):
         self.op_type = "slice"
@@ -95,7 +100,7 @@ class TestSliceOp_decs_dim(OpTest):
         }
 
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [2, 3, 4]
         self.axes = [0, 1, 2]
@@ -112,9 +117,11 @@ class TestSliceOp_decs_dim(OpTest):
             self.check_grad_with_place(place, ['Input'], 'Out')
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [2, 1, 4]
         self.axes = [0, 1, 2]
@@ -123,9 +130,11 @@ class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
         self.out = self.input[1, 0, 2:4, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1, 0, 2]
         self.ends = [1000000, 1, 4]
         self.axes = [0, 1, 2]
@@ -134,9 +143,11 @@ class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
         self.out = self.input[-1, 0, 2:4, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 7]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
         self.starts = [0, 1, 2, 3]
         self.ends = [1, 2, 3, 4]
         self.axes = [0, 1, 2, 3]
@@ -145,9 +156,11 @@ class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
         self.out = self.input[0, 1, 2, 3:4]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1]
         self.ends = [1000000]
         self.axes = [3]
@@ -156,9 +169,11 @@ class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
         self.out = self.input[:, :, :, -1]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [0, 1, 2, 3]
         self.ends = [1, 2, 3, 4]
         self.axes = [0, 1, 2, 3]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
new file mode 100644
index 0000000000..c4418bd55c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from paddle.fluid.op import Operator
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestTopkOp(OpTest):
+    def setUp(self):
+        self.variable_k = False
+        self.use_xpu = True
+        self.set_args()
+        self.op_type = "top_k"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        k = self.top_k
+        input = np.random.random((self.row, k)).astype(self.dtype)
+        output = np.ndarray((self.row, k))
+        indices = np.ndarray((self.row, k)).astype("int64")
+        self.inputs = {'X': input}
+
+        if self.variable_k:
+            self.inputs['K'] = np.array([k]).astype("int32")
+        else:
+            self.attrs = {'k': k}
+
+        for rowid in range(self.row):
+            row = input[rowid]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
+
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def set_args(self):
+        self.row = 100
+        self.top_k = 1
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
--
GitLab
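For reference, the expected outputs that the new XPU unit tests compare against are built with plain NumPy, and the short sketch below mirrors that logic for the slice and top_k kernels. It is an illustration only, not part of the patch; the helper names ref_slice and ref_top_k are hypothetical.

# Minimal NumPy sketch of the reference semantics used by the tests above.
import numpy as np


def ref_slice(x, axes, starts, ends):
    # Slice only the given axes; all other dimensions are taken in full,
    # which is how the slice tests index self.input to compute self.out.
    index = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        index[axis] = slice(start, end)
    return x[tuple(index)]


def ref_top_k(x, k):
    # Row-wise descending top-k values and their int64 indices, matching the
    # per-row np.sort / argsort loop in TestTopkOp.setUp.
    values = np.sort(x, axis=-1)[:, ::-1][:, :k]
    indices = np.argsort(x, axis=-1)[:, ::-1][:, :k].astype("int64")
    return values, indices


x = np.random.random([3, 4, 5, 6]).astype("float32")
out = ref_slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
assert out.shape == (2, 3, 2, 6)

m = np.random.random((100, 5)).astype("float32")
values, indices = ref_top_k(m, k=1)
assert values.shape == (100, 1) and indices.dtype == np.int64

On the kernel side, xpu::sorted_topk produces int32 indices, so TopkXPUKernel stages them in a temporary tensor and widens them to int64 with xpu::cast_v2 before writing the operator's Indices output.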