[XPU] migrate reduce kernels to phi, test=kunlun (#45973)

5829069d · ykkk2333 · GitHub · d7e74e63 · d7e74e63 · d7e74e63
12 changed files
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class ReduceMaxXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    XPUReduce<DeviceContext, T>(context, xpu::reduce_max<T>);
-  }
-};
-template <typename DeviceContext, typename T>
-class ReduceMaxGradXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Input<Tensor>("Out");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int in_dtype = context.Attr<int>("in_dtype");
-    PADDLE_ENFORCE_EQ(
-        in_dtype == -1,
-        true,
-        platform::errors::InvalidArgument(
-            "XPU only support in_dtype == -1 in reduce_sum_grad op."));
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    x_grad->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-    const T* out_data = out->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    auto* x_grad_data = x_grad->data<T>();
-    const auto& input_dim_size = x->dims().size();
-    std::vector<int> true_dims;
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) {
-        true_dims.push_back(dims[i] + input_dim_size);
-      } else {
-        true_dims.push_back(dims[i]);
-      }
-    }
-    std::vector<int> ydims(input_dim_size);
-    std::vector<int> xdims((input_dim_size));
-    std::set<int> dims_set(true_dims.begin(), true_dims.end());
-    for (auto i = 0; i < input_dim_size; i++) {
-      xdims[i] = x->dims()[i];
-      if (dims_set.find(i) != dims_set.end() || reduce_all) {
-        ydims[i] = 1;
-      } else {
-        ydims[i] = x->dims()[i];
-      }
-    }
-    T* brocast1 = nullptr;
-    T* brocast2 = nullptr;
-    bool* equal = nullptr;
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc(reinterpret_cast<void**>(&brocast1), x->numel() * sizeof(T)),
-        XPU_SUCCESS,
-        platform::errors::ResourceExhausted("XPU has no enough memory"));
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc(reinterpret_cast<void**>(&equal), x->numel() * sizeof(bool)),
-        XPU_SUCCESS,
-        platform::errors::ResourceExhausted("XPU has no enough memory"));
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc(reinterpret_cast<void**>(&brocast2), x->numel() * sizeof(T)),
-        XPU_SUCCESS,
-        platform::errors::ResourceExhausted("XPU has no enough memory"));
-    // step 1. brocast out and out_grad
-    int r = xpu::broadcast<T>(
-        dev_ctx.x_context(), out_data, brocast1, ydims, xdims);
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU broadcast in reduce_max_grad op return"
-                                   " wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-    r = xpu::broadcast<T>(
-        dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims);
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU broadcast in reduce_max_grad op return"
-                                   " wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-    // step 2. comparse out_brocast and x
-    r = xpu::equal<T>(dev_ctx.x_context(), x_data, brocast1, equal, x->numel());
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU equal in reduce_max_grad "
-                                   "op return wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-    // step 3. get x_grad
-    r = xpu::constant<T>(dev_ctx.x_context(), brocast1, x->numel(), 0);
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU constant in reduce_max_grad op return"
-                                   " wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-    r = xpu::select<T>(dev_ctx.x_context(),
-                       equal,
-                       brocast2,
-                       brocast1,
-                       x_grad_data,
-                       xdims,
-                       xdims);
-    PADDLE_ENFORCE_EQ(
-        r == xpu::Error_t::SUCCESS,
-        true,
-        platform::errors::External("XPU select in reduce_max_grad op return"
-                                   " wrong value[%d %s].",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-    if (dev_ctx.x_context()->xpu_stream) {
-      dev_ctx.Wait();
-    }
-    xpu_free(brocast1);
-    xpu_free(brocast2);
-    xpu_free(equal);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_XPU_KERNEL(
-    reduce_max,
-    ops::ReduceMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    reduce_max_grad,
-    ops::ReduceMaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class ReduceMeanXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()),
-        true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    std::vector<int> xdims;
-    for (int i = 0; i < input->dims().size(); i++) {
-      xdims.push_back(input->dims()[i]);
-    }
-    auto rdims = context.Attr<std::vector<int>>("dim");
-    const auto& input_dim_size = input->dims().size();
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      for (size_t i = 0; i < xdims.size(); i++) {
-        reduce_dims.push_back(static_cast<int>(i));
-      }
-    } else {
-      for (size_t i = 0; i < rdims.size(); ++i) {
-        if (rdims[i] < 0) {
-          reduce_dims.push_back(rdims[i] + input_dim_size);
-        } else {
-          reduce_dims.push_back(rdims[i]);
-        }
-      }
-    }
-    int r = xpu::reduce_mean(dev_ctx.x_context(),
-                             reinterpret_cast<const XPUType*>(input->data<T>()),
-                             reinterpret_cast<XPUType*>(output->data<T>()),
-                             xdims,
-                             reduce_dims);
-    PADDLE_ENFORCE_EQ(r,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU reduce_mean kernel return wrong value[%d %s]",
-                          r,
-                          XPUAPIErrorMsg[r]));
-  }
-};
-template <typename DeviceContext, typename T>
-class ReduceMeanGradXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    XPUType* x_data =
-        reinterpret_cast<XPUType*>(input_grad->mutable_data<T>(ctx.GetPlace()));
-    const XPUType* dy_data =
-        reinterpret_cast<const XPUType*>(output_grad->data<T>());
-    bool reduce_all = ctx.Attr<bool>("reduce_all");
-    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
-    bool keep_dim = ctx.Attr<bool>("keep_dim");
-    std::vector<int> xdims;
-    for (int i = 0; i < input->dims().size(); i++) {
-      xdims.push_back(input->dims()[i]);
-    }
-    std::vector<int> ydims;
-    for (int i = 0; i < output_grad->dims().size(); i++) {
-      ydims.push_back(output_grad->dims()[i]);
-    }
-    int reduce_numel = 1;
-    if (reduce_all) {
-      reduce_dims.clear();
-      for (size_t d = 0; d < xdims.size(); ++d) {
-        reduce_dims.push_back(static_cast<int>(d));
-      }
-    }
-    for (auto& d : reduce_dims) {
-      if (d < 0) {
-        d = d + xdims.size();
-      }
-      reduce_numel *= xdims[d];
-    }
-    if (keep_dim != true) {
-      sort(reduce_dims.begin(), reduce_dims.end());
-      for (auto& d : reduce_dims) {
-        ydims.insert(ydims.begin() + d, 1);
-      }
-    }
-    float val = 1.0f / static_cast<float>(reduce_numel);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r = xpu::constant(
-        dev_ctx.x_context(), x_data, input->numel(), static_cast<XPUType>(val));
-    PADDLE_ENFORCE_EQ(r,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU constant kernel return wrong value[%d %s]",
-                          r,
-                          XPUAPIErrorMsg[r]));
-    r = xpu::broadcast_mul(
-        dev_ctx.x_context(), x_data, dy_data, x_data, xdims, ydims);
-    PADDLE_ENFORCE_EQ(r,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU broadcast_mul kernel return wrong value[%d %s]",
-                          r,
-                          XPUAPIErrorMsg[r]));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_XPU_KERNEL(
-    reduce_mean,
-    ops::ReduceMeanXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    reduce_mean_grad,
-    ops::ReduceMeanGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_xpu.cc
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class ReduceProdXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()),
-        true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    std::vector<int> xdims;
-    for (int i = 0; i < input->dims().size(); i++) {
-      xdims.push_back(input->dims()[i]);
-    }
-    auto rdims = context.Attr<std::vector<int>>("dim");
-    const auto& input_dim_size = input->dims().size();
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      for (size_t i = 0; i < xdims.size(); i++) {
-        reduce_dims.push_back(static_cast<int>(i));
-      }
-    } else {
-      for (size_t i = 0; i < rdims.size(); ++i) {
-        if (rdims[i] < 0) {
-          reduce_dims.push_back(rdims[i] + input_dim_size);
-        } else {
-          reduce_dims.push_back(rdims[i]);
-        }
-      }
-    }
-    int r = xpu::reduce_prod(dev_ctx.x_context(),
-                             reinterpret_cast<const XPUType*>(input->data<T>()),
-                             reinterpret_cast<XPUType*>(output->data<T>()),
-                             xdims,
-                             reduce_dims);
-    PADDLE_ENFORCE_EQ(r,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU reduce_prod kernel return wrong value[%d %s]",
-                          r,
-                          XPUAPIErrorMsg[r]));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_XPU_KERNEL(
-    reduce_prod,
-    ops::ReduceProdXPUKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
--- a/paddle/phi/kernels/reduce_max_kernel.cc
+++ b/paddle/phi/kernels/reduce_max_kernel.cc
@@ -42,7 +42,7 @@ PD_REGISTER_KERNEL(
    max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
 #endif
-#if defined(PADDLE_WITH_XPU_KP)
+#if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU)
 PD_REGISTER_KERNEL(max, KPS, ALL_LAYOUT, phi::MaxKernel, float) {}
 #endif
@@ -50,3 +50,7 @@ PD_REGISTER_KERNEL(max, KPS, ALL_LAYOUT, phi::MaxKernel, float) {}
 PD_REGISTER_KERNEL(
    max, OneDNN, ALL_LAYOUT, phi::MaxKernel, float, phi::dtype::bfloat16) {}
 #endif
+#if defined(PADDLE_WITH_XPU)
+PD_REGISTER_KERNEL(max, XPU, ALL_LAYOUT, phi::MaxKernel, float) {}
+#endif
--- a/paddle/phi/kernels/reduce_mean_kernel.cc
+++ b/paddle/phi/kernels/reduce_mean_kernel.cc
@@ -47,7 +47,7 @@ PD_REGISTER_KERNEL(mean,
                   phi::dtype::float16) {}
 #endif
-#if defined(PADDLE_WITH_XPU_KP)
+#if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU)
 PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {}
 #endif
@@ -55,3 +55,7 @@ PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {}
 PD_REGISTER_KERNEL(
    mean, OneDNN, ALL_LAYOUT, phi::MeanKernel, float, phi::dtype::bfloat16) {}
 #endif
+#if defined(PADDLE_WITH_XPU)
+PD_REGISTER_KERNEL(mean, XPU, ALL_LAYOUT, phi::MeanKernel, float) {}
+#endif
--- a/paddle/phi/kernels/reduce_prod_kernel.cc
+++ b/paddle/phi/kernels/reduce_prod_kernel.cc
@@ -39,6 +39,10 @@ PD_REGISTER_KERNEL(
    prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
 #endif
-#if defined(PADDLE_WITH_XPU_KP)
+#if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU)
 PD_REGISTER_KERNEL(prod, KPS, ALL_LAYOUT, phi::ProdKernel, float) {}
 #endif
+#if defined(PADDLE_WITH_XPU)
+PD_REGISTER_KERNEL(prod, XPU, ALL_LAYOUT, phi::ProdKernel, float) {}
+#endif
--- a/paddle/phi/kernels/xpu/reduce.h
+++ b/paddle/phi/kernels/xpu/reduce.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+namespace phi {
+// Shared driver for the XPU reduce kernels that include this header.
+//
+// Allocates `out` as T, converts the requested axes into the
+// std::vector<int> form that `func` expects, and dispatches:
+//   * reduce_all: every axis of `x` is reduced;
+//   * otherwise: negative axes are wrapped by the rank of `x`, duplicates
+//     are dropped, and axes whose extent is 1 are skipped;
+//   * nothing left to reduce: `x` is copied into `out` element for element.
+//
+// `keep_dim` is accepted so the signature lines up with the kernels, but it is
+// not read here.
+//
+// Returns the status code produced by `func`. On the copy path a failure is
+// raised immediately through PADDLE_ENFORCE_XDNN_SUCCESS, so the value
+// returned from that path is always a success code.
+template <typename Context, typename T>
+int XPUReduce(const Context& dev_ctx,
+              const DenseTensor& x,
+              const std::vector<int64_t>& dims,
+              bool keep_dim,
+              bool reduce_all,
+              DenseTensor* out,
+              std::function<int(xpu::Context*,
+                                const T*,
+                                T*,
+                                const std::vector<int>&,
+                                const std::vector<int>&)> func) {
+  dev_ctx.template Alloc<T>(out);
+  const auto* x_data = x.data<T>();
+  auto* y_data = out->data<T>();
+  const auto& input_dim_size = x.dims().size();
+  // Wrap negative axes so that every entry is counted from the front.
+  std::vector<int> true_dims;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (dims[i] < 0) {
+      true_dims.push_back(dims[i] + input_dim_size);
+    } else {
+      true_dims.push_back(dims[i]);
+    }
+  }
+  std::vector<int> reduce_dims;
+  std::vector<int> xdims((input_dim_size));
+  for (int i = 0; i < input_dim_size; ++i) {
+    xdims[i] = x.dims()[i];
+  }
+  if (reduce_all) {
+    for (int i = 0; i < input_dim_size; ++i) {
+      reduce_dims.push_back(i);
+    }
+  } else {
+    // The set removes duplicates; walking i in order keeps reduce_dims
+    // ascending. Axes of extent 1 are left out of the reduction.
+    std::set<int> dims_set(true_dims.begin(), true_dims.end());
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) != dims_set.end() && x.dims()[i] != 1) {
+        reduce_dims.push_back(i);
+      }
+    }
+  }
+  int r = xpu::SUCCESS;
+  if (reduce_dims.size() == 0) {
+    // xpu::copy<T> is given a count of T elements, not a byte count, so the
+    // length is numel() with no sizeof(T) factor.
+    r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data, x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  } else {
+    r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
+  }
+  return r;
+}
+}  // namespace phi
--- a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_max_grad_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+namespace phi {
+// Gradient of the max reduction on XPU.
+//
+// x_grad receives out_grad at every position where x equals the reduced
+// maximum `out`, and 0 elsewhere; when several positions tie for the maximum,
+// each of them receives the gradient. Steps:
+//   1. broadcast `out` and `out_grad` back to the shape of `x`;
+//   2. build a boolean mask of x == broadcast(out);
+//   3. select broadcast(out_grad) where the mask is set, 0 otherwise.
+//
+// `keep_dim` is not read: the reduced shape is rebuilt from the shape of `x`
+// and the reduced axes, so it has the same rank as `x` either way.
+//
+// Scratch buffers come from an xpu::ctx_guard, so they are handed back when
+// the guard goes out of scope, including when one of the enforce checks below
+// throws. With raw xpu_malloc/xpu_free any failure after the first allocation
+// leaked device memory.
+template <typename T, typename Context>
+void ReduceMaxGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const IntArray& dims_arr,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* x_grad) {
+  auto dims = dims_arr.GetData();
+  dev_ctx.template Alloc<T>(x_grad);
+  const T* x_data = x.data<T>();
+  const T* out_data = out.data<T>();
+  const T* out_grad_data = out_grad.data<T>();
+  auto* x_grad_data = x_grad->data<T>();
+  const auto& input_dim_size = x.dims().size();
+  // Wrap negative axes so that every entry is counted from the front.
+  std::vector<int> true_dims;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (dims[i] < 0) {
+      true_dims.push_back(dims[i] + input_dim_size);
+    } else {
+      true_dims.push_back(dims[i]);
+    }
+  }
+  // ydims is the shape of x with every reduced axis collapsed to 1.
+  std::vector<int> ydims(input_dim_size);
+  std::vector<int> xdims((input_dim_size));
+  std::set<int> dims_set(true_dims.begin(), true_dims.end());
+  for (auto i = 0; i < input_dim_size; i++) {
+    xdims[i] = x.dims()[i];
+    if (dims_set.find(i) != dims_set.end() || reduce_all) {
+      ydims[i] = 1;
+    } else {
+      ydims[i] = x.dims()[i];
+    }
+  }
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  T* out_bcast = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
+  PADDLE_ENFORCE_NOT_NULL(
+      out_bcast, errors::ResourceExhausted("XPU has no enough memory"));
+  bool* equal = RAII_GUARD.alloc_l3_or_gm<bool>(x.numel());
+  PADDLE_ENFORCE_NOT_NULL(
+      equal, errors::ResourceExhausted("XPU has no enough memory"));
+  T* out_grad_bcast = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
+  PADDLE_ENFORCE_NOT_NULL(
+      out_grad_bcast, errors::ResourceExhausted("XPU has no enough memory"));
+  // step 1. broadcast out and out_grad to the shape of x
+  int r =
+      xpu::broadcast<T>(dev_ctx.x_context(), out_data, out_bcast, ydims, xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+  r = xpu::broadcast<T>(
+      dev_ctx.x_context(), out_grad_data, out_grad_bcast, ydims, xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+  // step 2. compare the broadcast out with x
+  r = xpu::equal<T>(dev_ctx.x_context(), x_data, out_bcast, equal, x.numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal");
+  // step 3. get x_grad; out_bcast is no longer needed and is reused as the
+  // all-zero operand of the select
+  r = xpu::constant<T>(dev_ctx.x_context(), out_bcast, x.numel(), 0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  r = xpu::select<T>(dev_ctx.x_context(),
+                     equal,
+                     out_grad_bcast,
+                     out_bcast,
+                     x_grad_data,
+                     xdims,
+                     xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "select");
+}
+}  // namespace phi
+// Registers phi::ReduceMaxGradKernel as the `max_grad` kernel for the XPU
+// backend with ALL_LAYOUT; float is the only data type listed.
+PD_REGISTER_KERNEL(max_grad, XPU, ALL_LAYOUT, phi::ReduceMaxGradKernel, float) {
+}
--- a/paddle/phi/kernels/xpu/reduce_max_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_max_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+namespace phi {
+// `max_raw` for XPU: hands the tensor and the requested axes to the shared
+// XPUReduce driver with xpu::reduce_max<T> as the reduction, then raises if
+// the returned status is not a success code.
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const IntArray& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  const auto& axes = dims.GetData();
+  const int status = XPUReduce<Context, T>(
+      dev_ctx, x, axes, keep_dim, reduce_all, out, xpu::reduce_max<T>);
+  PADDLE_ENFORCE_XDNN_SUCCESS(status, "reduce_max");
+}
+}  // namespace phi
+// Registers phi::MaxRawKernel as the `max_raw` kernel for the XPU backend with
+// ALL_LAYOUT; float is the only data type listed.
+PD_REGISTER_KERNEL(max_raw, XPU, ALL_LAYOUT, phi::MaxRawKernel, float) {}
--- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_mean_grad_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+namespace phi {
+// Gradient of the mean reduction on XPU.
+//
+// Every element of x_grad is out_grad (broadcast over the reduced axes)
+// multiplied by 1 / N, where N is the number of elements that were averaged.
+// x_grad is first filled with 1 / N and then multiplied in place by out_grad.
+//
+// The broadcast shape of out_grad is rebuilt from the shape of `x` (reduced
+// axes collapsed to 1) rather than from out_grad's own dims, so it always has
+// the rank of `x` and `keep_dim` does not need to be consulted. The axes are
+// wrapped and deduplicated first, so a repeated axis is counted once.
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const IntArray& dims_arr,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DenseTensor* x_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  dev_ctx.template Alloc<T>(x_grad);
+  const XPUType* dy_data = reinterpret_cast<const XPUType*>(out_grad.data<T>());
+  XPUType* x_data = reinterpret_cast<XPUType*>(x_grad->data<T>());
+
+  const int rank = x.dims().size();
+  std::vector<int> xdims;
+  for (int i = 0; i < rank; i++) {
+    xdims.push_back(x.dims()[i]);
+  }
+
+  // Collect the reduced axes, counted from the front, without duplicates.
+  std::set<int> reduce_axes;
+  if (reduce_all) {
+    for (int d = 0; d < rank; ++d) {
+      reduce_axes.insert(d);
+    }
+  } else {
+    for (const auto& d : dims_arr.GetData()) {
+      const int axis = static_cast<int>(d < 0 ? d + rank : d);
+      PADDLE_ENFORCE_EQ(
+          axis >= 0 && axis < rank,
+          true,
+          errors::InvalidArgument(
+              "The reduce axis of mean_grad is out of range for the input."));
+      reduce_axes.insert(axis);
+    }
+  }
+
+  // ydims: shape of x with each reduced axis set to 1.
+  // reduce_numel: number of input elements averaged into one output element.
+  std::vector<int> ydims(xdims);
+  int64_t reduce_numel = 1;
+  for (const int axis : reduce_axes) {
+    reduce_numel *= xdims[axis];
+    ydims[axis] = 1;
+  }
+
+  float val = 1.0f / static_cast<float>(reduce_numel);
+  int r = xpu::constant(
+      dev_ctx.x_context(), x_data, x.numel(), static_cast<XPUType>(val));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  r = xpu::broadcast_mul(
+      dev_ctx.x_context(), x_data, dy_data, x_data, xdims, ydims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
+}
+}  // namespace phi
+// Registers phi::ReduceMeanGradKernel as the `mean_grad` kernel for the XPU
+// backend with ALL_LAYOUT; float is the only data type listed.
+PD_REGISTER_KERNEL(
+    mean_grad, XPU, ALL_LAYOUT, phi::ReduceMeanGradKernel, float) {}
--- a/paddle/phi/kernels/xpu/reduce_mean_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_mean_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_mean_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+namespace phi {
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const IntArray& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  int r = XPUReduce<Context, T>(dev_ctx,
+                                x,
+                                dims.GetData(),
+                                keep_dim,
+                                reduce_all,
+                                out,
+                                xpu::reduce_mean<T>);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_mean");
+}
+}  // namespace phi
+// Registers phi::MeanRawKernel as the `mean_raw` kernel for the XPU backend
+// with ALL_LAYOUT; float is the only data type listed.
+PD_REGISTER_KERNEL(mean_raw, XPU, ALL_LAYOUT, phi::MeanRawKernel, float) {}
--- a/paddle/phi/kernels/xpu/reduce_prod_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_prod_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/reduce_prod_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/xpu/reduce.h"
+namespace phi {
+template <typename T, typename Context>
+void ProdRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const IntArray& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  int r = XPUReduce<Context, T>(dev_ctx,
+                                x,
+                                dims.GetData(),
+                                keep_dim,
+                                reduce_all,
+                                out,
+                                xpu::reduce_prod<T>);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_prod");
+}
+}  // namespace phi
+// Registers phi::ProdRawKernel as the `prod_raw` kernel for the XPU backend
+// with ALL_LAYOUT; float is the only data type listed.
+PD_REGISTER_KERNEL(prod_raw, XPU, ALL_LAYOUT, phi::ProdRawKernel, float) {}