From 5098891fdf573a9a2db5fedacbefa059c9def8ce Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Sat, 10 Oct 2020 15:34:54 +0800
Subject: [PATCH] add softmax xpu kernel (#27700)

---
 paddle/fluid/operators/softmax_op_xpu.cc      | 99 +++++++++++++++++++
 .../unittests/xpu/test_softmax_op_xpu.py      | 93 +++++++++++++++++
 2 files changed, 192 insertions(+)
 create mode 100644 paddle/fluid/operators/softmax_op_xpu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py

diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
new file mode 100644
index 0000000000..29740000ae
--- /dev/null
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+template <typename DeviceContext, typename T>
+class SoftmaxXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    const int rank = x->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+    PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true,
+                      platform::errors::InvalidArgument(
+                          "xpu softmax kernel only support last dimension of x "
+                          "(axis==-1 or axis==x_dims-1), but received axis: "
+                          "%d, x's shape: %s.",
+                          axis, x->dims()));
+
+    // allocate memory on device.
+    out->mutable_data<T>(context.GetPlace());
+
+    const int n = SizeToAxis(axis, x->dims());
+    const int d = SizeFromAxis(axis, x->dims());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data<T>(),
+                                   out->data<T>(), n, d, d <= 2048);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(softmax2d_forward) return wrong "
+                                   "value[%d], please check whether "
+                                   "Baidu Kunlun Card is properly installed.",
+                                   r));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Input<Tensor>("Out");
+    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    const int rank = dx->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    dx->mutable_data<T>(context.GetPlace());
+
+    const int n = SizeToAxis(axis, dx->dims());
+    const int d = SizeFromAxis(axis, dx->dims());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r =
+        xpu::softmax2d_backward(dev_ctx.x_context(), out->data<T>(),
+                                dout->data<T>(), dx->data<T>(), n, d);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(softmax2d_backward) return wrong "
+                                   "value[%d], please check whether "
+                                   "Baidu Kunlun Card is properly installed.",
+                                   r));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    softmax, ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
new file mode 100644
index 0000000000..92842fbc2e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+import sys
+import unittest
+sys.path.append("..")
+from op_test import OpTest
+
+paddle.enable_static()
+np.random.seed(10)
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    # Clip shiftx; otherwise, when calculating a loss with
+    # log(exp(shiftx)), we may get log(0) = -INF.
+    shiftx = (x - np.max(x)).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
+def ref_softmax(x, axis=None, dtype=None):
+    x_t = x.copy()
+    if dtype is not None:
+        x_t = x_t.astype(dtype)
+    if axis is None:
+        axis = -1
+    return np.apply_along_axis(stable_softmax, axis, x_t)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "softmax"
+        self.dtype = np.float32
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.set_attrs()
+
+        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, self.axis, x)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {'axis': self.axis, 'use_xpu': True}
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out')
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.axis = 3
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmax2D(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [10, 12]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmax3D(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [4, 5, 6]
+
+
+if __name__ == "__main__":
+    unittest.main()
--
GitLab
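
For reviewers, a minimal sketch of how the new kernel is exercised end to end. This is illustrative only, not part of the patch; it assumes a Paddle 2.x build with XPU support and uses only public static-graph APIs (paddle.static.*, paddle.nn.functional.softmax), none of which are introduced by this diff. Note that softmax must run over the last axis, per the PADDLE_ENFORCE_EQ check in the forward kernel.

    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        # The XPU kernel only supports softmax over the last axis
        # (axis == -1 or axis == rank - 1).
        x = paddle.static.data(name='x', shape=[2, 3, 4, 5], dtype='float32')
        out = paddle.nn.functional.softmax(x, axis=-1)

    # Fall back to CPU when the build has no XPU support.
    place = paddle.XPUPlace(0) if paddle.is_compiled_with_xpu() \
        else paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(startup_prog)

    x_np = np.random.uniform(-1, 1, [2, 3, 4, 5]).astype('float32')
    out_np, = exe.run(main_prog, feed={'x': x_np}, fetch_list=[out])
    print(out_np.sum(axis=-1))  # every slice along the last axis sums to ~1.0

Internally, the kernel flattens the input to an [n, d] matrix via SizeToAxis/SizeFromAxis (here n = 2*3*4 = 24, d = 5) before calling xpu::softmax2d_forward.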