Add the Kunlun kernels for Paddle 2.0

Add XPU kernels for the KUNLUN core: the accuracy op, sign op, scale op, and sum op. Add a default atol in the XPU unit tests.

Add the Kunlun kernels for Paddle 2.0
Add XPU kernels for the KUNLUN core: the accuracy op, sign op, scale op, and sum op. Add a default atol in the XPU unit tests.
d4359b0f · Jack Zhou · GitHub · 4676f03c · d4359b0f · d4359b0f
8 changed files
--- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include <memory>
+#include <vector>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+/**
+ * XPU kernel for the accuracy op.
+ *
+ * Reads the "Indices" and "Label" inputs as int64 data, narrows them to int
+ * on the host, uploads the narrowed copies to the device and calls
+ * xpu::accuracy to fill the "Accuracy", "Correct" and "Total" outputs.
+ *
+ * All temporary buffers are owned by RAII objects, so they are released even
+ * when one of the PADDLE_ENFORCE checks throws.
+ */
+template <typename DeviceContext, typename T>
+class AccuracyXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Out");
+    auto* indices = ctx.Input<Tensor>("Indices");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+    const int64_t* indices_data = indices->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+    const size_t num_samples = inference->dims()[0];
+    const size_t class_dim = inference->dims()[1];
+    // Nothing to evaluate for an empty batch.
+    if (num_samples == 0) {
+      return;
+    }
+    const size_t num_indices = num_samples * class_dim;
+    const size_t indices_int32_size = num_indices * sizeof(int);
+    const size_t indices_int64_size = num_indices * sizeof(int64_t);
+    const size_t label_int32_size = num_samples * sizeof(int);
+    const size_t label_int64_size = num_samples * sizeof(int64_t);
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    const platform::XPUPlace xpu_place =
+        BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace());
+
+    // Device-side scratch buffers for the narrowed (int) indices and labels.
+    XPUIntBuffer indices_int32_device = AllocXPUInts(indices_int32_size);
+    XPUIntBuffer label_int32_device = AllocXPUInts(label_int32_size);
+
+    // Bring the int64 inputs to the host so they can be narrowed there.
+    std::vector<int64_t> indices_int64_host(num_indices);
+    std::vector<int64_t> label_int64_host(num_samples);
+    dev_ctx.Wait();
+    memory::Copy(platform::CPUPlace(), indices_int64_host.data(), xpu_place,
+                 indices_data, indices_int64_size);
+    memory::Copy(platform::CPUPlace(), label_int64_host.data(), xpu_place,
+                 label_data, label_int64_size);
+
+    // xpu::accuracy is fed int buffers below, so narrow each value.
+    std::vector<int> indices_int32_host(num_indices);
+    for (size_t i = 0; i < num_indices; ++i) {
+      indices_int32_host[i] = static_cast<int>(indices_int64_host[i]);
+    }
+    std::vector<int> label_int32_host(num_samples);
+    for (size_t i = 0; i < num_samples; ++i) {
+      label_int32_host[i] = static_cast<int>(label_int64_host[i]);
+    }
+
+    memory::Copy(xpu_place, indices_int32_device.get(), platform::CPUPlace(),
+                 indices_int32_host.data(), indices_int32_size);
+    memory::Copy(xpu_place, label_int32_device.get(), platform::CPUPlace(),
+                 label_int32_host.data(), label_int32_size);
+    int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device.get(),
+                          label_int32_device.get(), num_samples, class_dim,
+                          correct_data, total_data, accuracy_data);
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+    // Wait for the kernel before the scratch buffers go out of scope.
+    dev_ctx.Wait();
+  }
+
+ private:
+  // Releases a buffer obtained from xpu_malloc.
+  struct XPUBufferDeleter {
+    void operator()(int* ptr) const { xpu_free(ptr); }
+  };
+  using XPUIntBuffer = std::unique_ptr<int, XPUBufferDeleter>;
+
+  // Allocates `bytes` bytes with xpu_malloc and returns an owning handle;
+  // raises ResourceExhausted when the allocation does not succeed.
+  static XPUIntBuffer AllocXPUInts(size_t bytes) {
+    int* raw = nullptr;
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc(reinterpret_cast<void**>(&raw), bytes), XPU_SUCCESS,
+        platform::errors::ResourceExhausted(
+            "\n\nOut of memory error on XPU, Cannot allocate %s memory"
+            " on XPU. \n\nPlease check whether there is any other process "
+            "using XPU.\n",
+            string::HumanReadableSize(bytes)));
+    return XPUIntBuffer(raw);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the float instantiation of AccuracyXPUKernel for the accuracy op.
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    accuracy,
+    ops::AccuracyXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/scale_op.h"
+#include <string>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+/**
+ * XPU kernel for the scale op.
+ *
+ * Reads the "scale", "bias" and "bias_after_scale" attributes and forwards
+ * them, together with the input "X" and output "Out" buffers, to xpu::scale.
+ * When "X" is a SelectedRows held in a different variable than "Out", the
+ * rows and height are copied to the output before the values are computed.
+ */
+template <typename DeviceContext, typename T>
+class ScaleXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_var = ctx.InputVar("X");
+    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
+    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+    auto* out_var = ctx.OutputVar("Out");
+    // Carry the sparse metadata over unless the op runs in place.
+    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
+      auto& in_slr = in_var->Get<framework::SelectedRows>();
+      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
+      out_slr->set_rows(in_slr.rows());
+      out_slr->set_height(in_slr.height());
+    }
+    auto* out =
+        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
+    out->mutable_data<T>(in->place());
+    PADDLE_ENFORCE_EQ(
+        in->dims(), out->dims(),
+        platform::errors::InvalidArgument("In and out should have the same dim,"
+                                          " expected %s, but got %s.",
+                                          in->dims().to_str().c_str(),
+                                          out->dims().to_str().c_str()));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // NOTE(review): the buffers are fetched as float rather than T; only the
+    // float instantiation is registered in this file.
+    int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias,
+                       bias_after_scale, in->data<float>(), out->data<float>());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the float instantiation of ScaleXPUKernel for the scale op.
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
--- a/paddle/fluid/operators/sign_op_xpu.cc
+++ b/paddle/fluid/operators/sign_op_xpu.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sign_op.h"
+#include "paddle/fluid/platform/xpu_header.h"
+namespace paddle {
+namespace operators {
+
+/**
+ * XPU kernel for the sign op.
+ *
+ * Allocates "Out" on the same place as "X" and runs
+ * xpu::activation_forward with Activation_t::SIGN over all elements of "X".
+ */
+template <typename DeviceContext, typename T>
+class SignXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    out->mutable_data<T>(in->place());
+    auto xpu_context = context.device_context<DeviceContext>().x_context();
+    int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN,
+                                    in->numel(), in->data<T>(), out->data<T>());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the float instantiation of SignXPUKernel for the sign op.
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    sign, ops::SignXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
--- a/paddle/fluid/operators/sum_op_xpu.cc
+++ b/paddle/fluid/operators/sum_op_xpu.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sum_op.h"
+#include <vector>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+/**
+ * XPU kernel for the sum op.
+ *
+ * Collects the data pointers of every non-empty LoDTensor in "X" and passes
+ * them to xpu::sum_batch, which writes into "Out". Inputs and output must be
+ * LoDTensors; anything else raises InvalidArgument.
+ */
+template <typename DeviceContext, typename T>
+class SumXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto in_vars = context.MultiInputVar("X");
+    auto out_var = context.OutputVar("Out");
+    auto *out = context.Output<LoDTensor>("Out");
+    // Guard the index so an empty input list never reads in_vars[0].
+    bool in_place = !in_vars.empty() && out_var == in_vars[0];
+    int N = in_vars.size();
+    PADDLE_ENFORCE_EQ(
+        out_var->IsType<framework::LoDTensor>(), true,
+        platform::errors::InvalidArgument("XPU only supports LoDTensor"));
+    // When the output aliases the first input its buffer already exists.
+    if (!in_place) {
+      out->mutable_data<T>(context.GetPlace());
+    }
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    // Only the first valid_count entries are filled; empty inputs are skipped.
+    std::vector<const float *> ptrs(N, nullptr);
+    int valid_count = 0;
+    for (int i = 0; i < N; ++i) {
+      PADDLE_ENFORCE_EQ(
+          in_vars[i]->IsType<framework::LoDTensor>(), true,
+          platform::errors::InvalidArgument("XPU only supports LoDTensor"));
+      auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
+      if (in_t.numel() == 0) {
+        continue;
+      }
+      ptrs[valid_count] = reinterpret_cast<const float *>(in_t.data<T>());
+      valid_count++;
+    }
+    int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data<T>(),
+                           valid_count, out->numel());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Short alias for the operators namespace, used by the registration below.
+namespace ops = paddle::operators;
+// Registers the float instantiation of SumXPUKernel for the sum op.
+REGISTER_OP_XPU_KERNEL(
+    sum, ops::SumXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
--- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+# Enable Paddle's static mode before the test class below is defined.
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUAccuracyOp(OpTest):
+    def setUp(self):
+        self.op_type = "accuracy"
+        self.init_dtype()
+        n = 8192
+        infer = np.random.random((n, 1)).astype(self.dtype)
+        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
+        label = np.random.randint(0, 2, (n, 1)).astype('int64')
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
+        }
+        self.attrs = {'use_xpu': True}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle
+# Enable Paddle's static mode before the test class below is defined.
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUScaleOp(OpTest):
+    def setUp(self):
+        self.op_type = "scale"
+        self.dtype = np.float32
+        self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
+        self.attrs = {'scale': -2.3, 'use_xpu': True}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
+        }
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+# Enable Paddle's static mode before the test class below is defined.
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSignOp(OpTest):
+    def setUp(self):
+        self.op_type = "sign"
+        self.dtype = np.float32
+        self.inputs = {
+            'X': np.random.uniform(-10, 10, (10, 10)).astype(self.dtype)
+        }
+        self.outputs = {'Out': np.sign(self.inputs['X'])}
+        self.attrs = {'use_xpu': True}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle
+# Enable Paddle's static mode before the test class below is defined.
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSumOp(OpTest):
+    def setUp(self):
+        self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
+        x0 = np.random.random((3, 40)).astype(self.dtype)
+        x1 = np.random.random((3, 40)).astype(self.dtype)
+        x2 = np.random.random((3, 40)).astype(self.dtype)
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        y = x0 + x1 + x2
+        self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True}
+
+    def init_kernel_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['x0'], 'Out')
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()