From 22fe4f03f611fd6be5dd3cb291814546a6cec389 Mon Sep 17 00:00:00 2001
From: dongfangshenzhu <102794151+dongfangshenzhu@users.noreply.github.com>
Date: Fri, 23 Sep 2022 16:37:32 +0800
Subject: [PATCH] add phi reduce_sum test=kunlun (#46241)

* add phi reduce_sum test=kunlun

* add phi reduce_sum test=kunlun

* add phi reduce_sum test=kunlun
---
 .../operators/reduce_ops/reduce_sum_op_xpu.cc | 99 -------------------
 paddle/phi/kernels/reduce_sum_kernel.cc       |  6 +-
 .../phi/kernels/xpu/reduce_sum_grad_kernel.cc | 65 ++++++++++++
 paddle/phi/kernels/xpu/reduce_sum_kernel.cc   | 44 +++++++++
 .../unittests/xpu/test_reduce_max_op_xpu.py   |  2 +-
 .../unittests/xpu/test_reduce_sum_op_xpu.py   |  2 +-
 6 files changed, 116 insertions(+), 102 deletions(-)
 delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
 create mode 100644 paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/xpu/reduce_sum_kernel.cc
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
deleted file mode 100644
index 1d36bdb2841..00000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ReduceSumXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    XPUReduce<DeviceContext, T>(context, xpu::reduce_sum<T>);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ReduceSumGradXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
-    int in_dtype = context.Attr<int>("in_dtype");
-    PADDLE_ENFORCE_EQ(
-        in_dtype == -1,
-        true,
-        platform::errors::InvalidArgument(
-            "XPU only support in_dtype == -1 in reduce_sum_grad op."));
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    x_grad->mutable_data<T>(context.GetPlace());
-    const auto* out_data = out->data<T>();
-    auto* x_grad_data = x_grad->data<T>();
-
-    const auto& input_dim_size = x->dims().size();
-    std::vector<int> true_dims;
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) {
-        true_dims.push_back(dims[i] + input_dim_size);
-      } else {
-        true_dims.push_back(dims[i]);
-      }
-    }
-
-    std::vector<int> ydims(input_dim_size);
-    std::vector<int> xdims((input_dim_size));
-    std::set<int> dims_set(true_dims.begin(), true_dims.end());
-    for (auto i = 0; i < input_dim_size; i++) {
-      xdims[i] = x->dims()[i];
-      if (dims_set.find(i) != dims_set.end() || reduce_all) {
-        ydims[i] = 1;
-      } else {
-        ydims[i] = x->dims()[i];
-      }
-    }
-
-    int r = xpu::broadcast<T>(
-        dev_ctx.x_context(), out_data, x_grad_data, ydims, xdims);
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU broadcast in reduce_sum_grad op return"
-                                   " wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_XPU_KERNEL(
-    reduce_sum,
-    ops::ReduceSumXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    reduce_sum_grad,
-    ops::ReduceSumGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc
index 83db2d854b9..075e4a6022d 100644
--- a/paddle/phi/kernels/reduce_sum_kernel.cc
+++ b/paddle/phi/kernels/reduce_sum_kernel.cc
@@ -73,7 +73,7 @@ PD_REGISTER_KERNEL(sum,
 }
 #endif
 
-#if defined(PADDLE_WITH_XPU_KP)
+#if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU)
 PD_REGISTER_KERNEL(sum, KPS, ALL_LAYOUT, phi::SumKernel, float) {
   kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
 }
@@ -83,3 +83,7 @@ PD_REGISTER_KERNEL(sum, KPS, ALL_LAYOUT, phi::SumKernel, float) {
 PD_REGISTER_KERNEL(
     sum, OneDNN, ALL_LAYOUT, phi::SumKernel, float, phi::dtype::bfloat16) {}
 #endif
+
+#if defined(PADDLE_WITH_XPU)
+PD_REGISTER_KERNEL(sum, XPU, ALL_LAYOUT, phi::SumKernel, float) {}
+#endif
diff --git a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc
new file mode 100644
index 00000000000..9dc1fe92fac
--- /dev/null
+++ b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+
+#include <set>
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// XPU backward kernel for sum: broadcasts out_grad into x_grad using the
+// dims of x. Axes listed in dims_arr (negative values are offset by the
+// rank of x), or every axis when reduce_all is set, get extent 1 in the
+// source shape handed to xpu::broadcast. keep_dim is accepted but never read.
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const IntArray& dims_arr,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* x_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const auto axes = dims_arr.GetData();
+  dev_ctx.template Alloc<XPUType>(x_grad);
+  const XPUType* grad_in = out_grad.data<XPUType>();
+  XPUType* grad_out = x_grad->data<XPUType>();
+  const int rank = x.dims().size();
+
+  // Normalize negative axes by adding the rank of x.
+  std::set<int> reduced_axes;
+  for (const auto axis : axes) {
+    reduced_axes.insert(axis < 0 ? axis + rank : axis);
+  }
+
+  // full_shape mirrors x.dims(); reduced_shape has 1 on each reduced axis.
+  std::vector<int> full_shape(rank);
+  std::vector<int> reduced_shape(rank);
+  for (int d = 0; d < rank; ++d) {
+    full_shape[d] = x.dims()[d];
+    const bool is_reduced = reduce_all || reduced_axes.count(d) > 0;
+    reduced_shape[d] = is_reduced ? 1 : full_shape[d];
+  }
+
+  // NOTE(review): assumes out_grad holds reduced_shape elements - confirm.
+  const int r = xpu::broadcast<XPUType>(
+      dev_ctx.x_context(), grad_in, grad_out, reduced_shape, full_shape);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sum_grad, XPU, ALL_LAYOUT, phi::ReduceSumGradKernel, float) {
+}
diff --git a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc
new file mode 100644
index 00000000000..74c50304b14
--- /dev/null
+++ b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+
+namespace phi {
+
+// XPU kernel for sum_raw: forwards to XPUReduce with xpu::reduce_sum<T> and
+// checks the returned status with PADDLE_ENFORCE_XDNN_SUCCESS.
+// NOTE(review): out_dtype is never read here - confirm that is intended.
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const IntArray& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out) {
+  // axes, keep_dim and reduce_all are passed through unchanged.
+  const auto& axes = dims.GetData();
+  const int ret = XPUReduce<Context, T>(
+      dev_ctx, x, axes, keep_dim, reduce_all, out, xpu::reduce_sum<T>);
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sum_raw, XPU, ALL_LAYOUT, phi::SumRawKernel, float) {}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
index ac827b6738f..597b7ee0fe9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
@@ -67,7 +67,7 @@ class XPUTestReduceMaxOp(XPUOpTestWrapper):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            pass
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class XPUTestReduceMaxCase1(XPUTestReduceMaxBase):
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
index d80fd187dfd..15db9e5a375 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
@@ -67,7 +67,7 @@ class XPUTestReduceSumOp(XPUOpTestWrapper):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            pass
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class XPUTestReduceSumCase1(XPUTestReduceSumBase):
 
-- 
GitLab