[XPU] add sampling_id op, add top_k op, update xdnn api. test=kunlun (#44704)

e61f48c1 · houj04 · GitHub · 72b65d6b · e61f48c1 · e61f48c1
8 changed files
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
  set(XPU_BASE_URL_WITHOUT_DATE
      "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220727")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220728")
 else()
  set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
  set(XPU_XDNN_BASE_URL_WITHOUT_DATE
      "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220727")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220728")
 else()
  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()

--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -302,6 +302,11 @@ void TensorFromVector(const std::vector<T>& src,
        size,
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
  }
+#endif
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(dst_place)) {  // NOLINT
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
 #endif
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
@@ -381,6 +386,11 @@ inline void TensorFromVector(const std::vector<bool>& src,
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
+#endif
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(dst_place)) {  // NOLINT
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
 #endif
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(

--- a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
@@ -219,20 +219,14 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
  // 4. nms
  int nms_keep_num = 0;
-  r = xpu::nms<T>(dev_ctx.x_context(),
+  r = xpu::sorted_nms<T>(dev_ctx.x_context(),
-                  proposals_filter.data<T>(),
+                         proposals_filter.data<T>(),
-                  nullptr,
+                         keep_index.data<int>(),
-                  keep_index.data<int>(),
+                         nms_keep_num,
-                  1,
+                         keep_num,
-                  1,
+                         nms_thresh,
-                  keep_num,
+                         pixel_offset);
-                  -1,
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_nms");
-                  nms_thresh,
-                  -1,
-                  0,
-                  &nms_keep_num,
-                  pixel_offset);
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "nms");
  if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) {
    keep_index.Resize({post_nms_top_n});
  } else {

--- a/paddle/fluid/operators/one_hot_op_xpu.cc
+++ b/paddle/fluid/operators/one_hot_op_xpu.cc
@@ -17,6 +17,7 @@
 #include <vector>
 #include "paddle/fluid/operators/one_hot_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
 namespace operators {
@@ -28,9 +29,13 @@ template <typename DeviceContext, typename T>
 class OneHotXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
+    const auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<LoDTensor>("Out");
+    // get depth from attr
    int depth = context.Attr<int>("depth");
+    // get depth from input tensor
    if (context.HasInput("depth_tensor")) {
      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
      auto* depth_data = depth_tensor->data<int32_t>();
@@ -50,18 +55,14 @@ class OneHotXPUKernel : public framework::OpKernel<T> {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    int len = in->numel();
+    // int one_hot(Context* ctx, const T* x, float* y, int len, int depth, float
+    // on_value = 1.0f, float off_value = 0.0f);
    int ret = xpu::one_hot<T>(dev_ctx.x_context(),
                              in->data<T>(),
                              out->mutable_data<float>(context.GetPlace()),
                              len,
                              depth);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "one_hot");
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU one_hot kernel return wrong value[%d %s]",
-                          ret,
-                          XPUAPIErrorMsg[ret]));
  }
 };

--- a/paddle/fluid/operators/sampling_id_op_xpu.cc
+++ b/paddle/fluid/operators/sampling_id_op_xpu.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include "paddle/fluid/operators/sampling_id_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(sampling_id,
+                       paddle::operators::SamplingIdKernel<float>,
+                       paddle::operators::SamplingIdKernel<double>);
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -322,6 +322,9 @@ XPUOpMap& get_kl2_ops() {
       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                     pOpKernelType(vartype::INT32, XPUPlace()),
                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"one_hot",
+       XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::INT64, XPUPlace())})},
      {"one_hot_v2",
       XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                     pOpKernelType(vartype::INT64, XPUPlace())})},
@@ -393,6 +396,9 @@ XPUOpMap& get_kl2_ops() {
      {"scatter",
       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sampling_id",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP64, XPUPlace())})},
      {"sgd",
       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                     pOpKernelType(vartype::FP16, XPUPlace())})},

--- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,172 +13,117 @@
 # limitations under the License.
 from __future__ import print_function
 import unittest
 import numpy as np
-import paddle
-import paddle.fluid.core as core
 import sys
 sys.path.append("..")
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest
 from op_test_xpu import XPUOpTest
-import paddle.fluid as fluid
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
-from paddle.fluid import Program, program_guard
-import time
 paddle.enable_static()
-"""
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 'core is not compiled with XPU')
+class XPUTestOneHotOP(XPUOpTestWrapper):
-class TestOneHotOp(XPUOpTest):
-    def setUp(self):
+    def __init__(self):
-        self.use_xpu = True
+        self.op_name = 'one_hot'
-        self.op_type = 'one_hot'
+        self.use_dynamic_create_class = False
-        depth = 10
-        depth_np = np.array(10).astype('int32')
+    class TestXPUOneHotOP(XPUOpTest):
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        def setUp(self):
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
+            self.op_type = 'one_hot'
-                              depth)).astype('float32')
+            self.set_data()
-        for i in range(np.product(x.shape)):
+            self.set_input()
-            out[i, x[i]] = 1.0
+        def set_data(self):
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+            self.depth = 10
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
+            self.depth_np = np.array(10).astype('int32')
-        self.outputs = {'Out': (out, x_lod)}
+            self.x_lod = [[4, 1, 3, 3]]
+            self.x = [
-    def test_check_output(self):
+                np.random.randint(0, self.depth - 1)
-        place = paddle.XPUPlace(0)
+                for i in range(sum(self.x_lod[0]))
-        self.check_output_with_place(place, check_dygraph=False)
+            ]
+            self.x = np.array(self.x).astype(self.dtype).reshape(
+                [sum(self.x_lod[0]), 1])
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 'core is not compiled with XPU')
+            self.out = np.zeros(shape=(np.product(self.x.shape[:-1]),
-class TestOneHotOp_attr(XPUOpTest):
+                                       self.depth)).astype('float32')
-    def setUp(self):
+            for i in range(np.product(self.x.shape)):
-        self.op_type = 'one_hot'
+                self.out[i, self.x[i]] = 1.0
-        depth = 10
-        x_lod = [[4, 1, 3, 3]]
+            self.outputs = {'Out': (self.out, self.x_lod)}
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+        def set_input(self):
+            self.inputs = {
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
+                'X': (self.x, self.x_lod),
-                              depth)).astype('float32')
+                'depth_tensor': self.depth_np
+            }
-        for i in range(np.product(x.shape)):
+            self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
-            out[i, x[i]] = 1.0
+        def test_check_output(self):
-        self.inputs = {'X': (x, x_lod)}
+            self.check_output(check_dygraph=False)
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
-        self.outputs = {'Out': (out, x_lod)}
+        def init_dtype(self):
+            self.dtype = self.in_type
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
+    class TestXPUOneHotOP_attr(TestXPUOneHotOP):
-        self.check_output_with_place(place, check_dygraph=False)
+        def set_input(self):
+            self.inputs = {'X': (self.x, self.x_lod)}
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+            self.attrs = {
-                 'core is not compiled with XPU')
+                'dtype': int(core.VarDesc.VarType.FP32),
-class TestOneHotOp_default_dtype(XPUOpTest):
+                'depth': self.depth
-    def setUp(self):
+            }
-        self.op_type = 'one_hot'
-        depth = 10
+    class TestXPUOneHotOP_default_dtype(TestXPUOneHotOP):
-        depth_np = np.array(10).astype('int32')
-        x_lod = [[4, 1, 3, 3]]
+        def set_input(self):
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+            self.inputs = {
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+                'X': (self.x, self.x_lod),
+                'depth_tensor': self.depth_np
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
+            }
-                              depth)).astype('float32')
+            self.attrs = {}
-        for i in range(np.product(x.shape)):
+    class TestXPUOneHotOP_default_dtype_attr(TestXPUOneHotOP):
-            out[i, x[i]] = 1.0
+        def set_input(self):
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+            self.inputs = {'X': (self.x, self.x_lod)}
-        self.attrs = {}
+            self.attrs = {'depth': self.depth}
-        self.outputs = {'Out': (out, x_lod)}
+    class TestXPUOneHotOP_out_of_range(TestXPUOneHotOP):
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
+        def set_data(self):
-        self.check_output_with_place(place, check_dygraph=False)
+            self.depth = 10
+            self.x_lod = [[4, 1, 3, 3]]
+            self.x = [
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                np.random.choice([-1, self.depth])
-                 'core is not compiled with XPU')
+                for i in range(sum(self.x_lod[0]))
-class TestOneHotOp_default_dtype_attr(XPUOpTest):
+            ]
-    def setUp(self):
+            self.x = np.array(self.x).astype(self.dtype).reshape(
-        self.op_type = 'one_hot'
+                [sum(self.x_lod[0]), 1])
-        depth = 10
-        x_lod = [[4, 1, 3, 3]]
+            self.out = np.zeros(shape=(np.product(self.x.shape[:-1]),
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+                                       self.depth)).astype('float32')
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+            self.outputs = {'Out': (self.out, self.x_lod)}
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
+        def set_input(self):
+            self.inputs = {'X': (self.x, self.x_lod)}
-        for i in range(np.product(x.shape)):
+            self.attrs = {'depth': self.depth, 'allow_out_of_range': True}
-            out[i, x[i]] = 1.0
-        self.inputs = {'X': (x, x_lod)}
+support_types = get_xpu_op_support_types('one_hot')
-        self.attrs = {'depth': depth}
+print("support_types: %s" % str(support_types))
-        self.outputs = {'Out': (out, x_lod)}
+for stype in support_types:
+    create_test_class(globals(), XPUTestOneHotOP, stype)
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
+if __name__ == "__main__":
-        self.check_output_with_place(place, check_dygraph=False)
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 'core is not compiled with XPU')
-class TestOneHotOp_out_of_range(XPUOpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'depth': depth, 'allow_out_of_range': True}
-        self.outputs = {'Out': (out, x_lod)}
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
-        self.check_output_with_place(place, check_dygraph=False)
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 'core is not compiled with XPU')
-class TestOneHotOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # the input must be Variable
-            in_w = np.random.random((4, 1)).astype('int32')
-            self.assertRaises(TypeError, fluid.layers.one_hot, in_w)
-            # the input must be int32 or int 64
-            in_w2 = fluid.layers.data(
-                name='in_w2',
-                shape=[4, 1],
-                append_batch_size=False,
-                dtype='float32')
-            self.assertRaises(TypeError, fluid.layers.one_hot, in_w2)
-            # the depth must be int, long or Variable
-            in_r = fluid.layers.data(
-                name='in_r',
-                shape=[4, 1],
-                append_batch_size=False,
-                dtype='int32')
-            depth_w = np.array([4])
-            self.assertRaises(TypeError, fluid.layers.one_hot, in_r, 4.1)
-            self.assertRaises(TypeError, fluid.layers.one_hot, in_r, depth_w)
-"""
-if __name__ == '__main__':
-    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid.op import Operator
+import paddle
+class TestSamplingIdShape(unittest.TestCase):
+    def test_shape(self):
+        paddle.enable_static()
+        x = fluid.layers.data(name='x', shape=[3], dtype='float32')
+        output = fluid.layers.sampling_id(x)
+        place = fluid.XPUPlace(0)
+        exe = fluid.Executor(place=place)
+        exe.run(fluid.default_startup_program())
+        feed = {
+            'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
+        }
+        output_np = exe.run(feed=feed, fetch_list=[output])[0]
+        self.assertEqual(output.shape[0], -1)
+        self.assertEqual(len(output.shape), 1)
+        self.assertEqual(output_np.shape[0], 2)
+        self.assertEqual(len(output_np.shape), 1)
+if __name__ == "__main__":
+    unittest.main()