Add gather_nd_grad op and zero-dim support for where_grad on XPU (#50454)

055d0c2d · zhangyikun02 · GitHub · 47c23ccb · 055d0c2d · 055d0c2d
4 changed files
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -299,6 +299,10 @@ XPUOpMap& get_kl2_ops() {
      {"floor", XPUKernelSet({phi::DataType::FLOAT32})},
      {"gather_grad",
       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"gather_nd_grad",
+       XPUKernelSet({phi::DataType::INT32,
+                     phi::DataType::INT64,
+                     phi::DataType::FLOAT32})},
      {"gather_nd",
       XPUKernelSet({phi::DataType::INT32,
                     phi::DataType::INT64,
@@ -697,6 +701,11 @@ XPUOpMap& get_kl2_ops() {
       XPUKernelSet({phi::DataType::INT32,
                     phi::DataType::BOOL,
                     phi::DataType::FLOAT32})},
+      {"where_grad",
+       XPUKernelSet({phi::DataType::INT32,
+                     phi::DataType::INT64,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::FLOAT32})},
      {"where",
       XPUKernelSet({phi::DataType::INT32,
                     phi::DataType::INT64,

--- a/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_nd_grad_kernel.h"
+
+#include <vector>
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// Backward kernel of gather_nd for XPU.
+//
+// Fills `x_grad` with zeros and then accumulates `out_grad` into it at the
+// positions selected by `index` (xpu::scatter_nd with overwrite == false).
+//
+//   ctx      - device context; provides allocation and the XDNN handle.
+//   x        - forward input; not read here, x_grad's own dims are used.
+//   index    - index tensor of the forward op; must be INT32 or INT64.
+//   out_grad - gradient w.r.t. the forward output.
+//   x_grad   - result; allocated here with element type T.
+//
+// Raises InvalidArgument when the index dtype is neither INT32 nor INT64, or
+// when an empty index comes with an out_grad whose size is not a multiple of
+// x_grad's size. XDNN failures surface via PADDLE_ENFORCE_XDNN_SUCCESS.
+template <typename T, typename Context>
+void GatherNdGradKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &out_grad,
+                        DenseTensor *x_grad) {
+  ctx.template Alloc<T>(x_grad);
+
+  // Every path below either leaves x_grad at zero or adds into it.
+  T *dx_data = x_grad->data<T>();
+  int r = xpu::constant<T>(
+      ctx.x_context(), dx_data, x_grad->numel(), static_cast<T>(0));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+
+  if (out_grad.numel() == 0) return;
+
+  if (index.numel() == 0) {
+    const int64_t x_numel = x_grad->numel();
+    const int64_t dout_numel = out_grad.numel();
+
+    // Same size: out_grad is exactly one copy of x, so the gradient is a
+    // plain copy (the original behaviour of this branch).
+    if (dout_numel == x_numel) {
+      r = xpu::copy(ctx.x_context(), out_grad.data<T>(), dx_data, x_numel);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+      return;
+    }
+
+    // Otherwise out_grad holds several stacked copies of x (e.g. an index of
+    // shape [2, 0] yields an output of shape [2] + x.shape). Copying only the
+    // first x_numel elements would drop the other slices, so sum them all.
+    PADDLE_ENFORCE_EQ(
+        x_numel > 0 && dout_numel % x_numel == 0,
+        true,
+        phi::errors::InvalidArgument(
+            "With an empty index, the element number of out_grad [%d] must "
+            "be a multiple of the element number of x_grad [%d].",
+            dout_numel,
+            x_numel));
+
+    // View x_grad as [1, x_numel] and out_grad as [copies, x_numel], then
+    // scatter-add every row of out_grad into row 0 of x_grad.
+    // NOTE(review): this reuses scatter_nd exactly as the general path below
+    // does (accumulate mode, host-only VectorParam); verify on device.
+    const int copies = static_cast<int>(dout_numel / x_numel);
+    std::vector<int> row_index(copies, 0);
+    std::vector<int64_t> flat_x_shape = {1, x_numel};
+    std::vector<int64_t> row_index_shape = {copies, 1};
+    xpu::VectorParam<int64_t> flat_x_vec = {
+        flat_x_shape.data(), static_cast<int>(flat_x_shape.size()), nullptr};
+    xpu::VectorParam<int> row_index_vec{row_index.data(), copies, nullptr};
+    r = xpu::scatter_nd<T, int>(ctx.x_context(),
+                                nullptr,
+                                out_grad.data<T>(),
+                                dx_data,
+                                row_index_vec,
+                                flat_x_vec,
+                                row_index_shape,
+                                false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "scatter_nd");
+    return;
+  }
+
+  const auto index_type = index.dtype();
+  const bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(
+      index_type_match,
+      true,
+      phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s], "
+                                   "but desires to be [%s] or [%s]",
+                                   index_type,
+                                   phi::DataType::INT32,
+                                   phi::DataType::INT64));
+
+  // NOTE(review): the VectorParam length below is index.dims()[0] (1 for a
+  // 0-D index), not index.numel() - confirm this is what scatter_nd expects.
+  const int index_size =
+      static_cast<int>(index.dims().size() == 0 ? 1 : index.dims()[0]);
+  auto x_shape = phi::vectorize<int64_t>(x_grad->dims());
+  auto index_shape = phi::vectorize<int64_t>(index.dims());
+  // A 1-D index is passed to scatter_nd as a single row: [K] -> [1, K].
+  if (index_shape.size() == 1) {
+    index_shape.insert(index_shape.begin(), 1);
+  }
+  xpu::VectorParam<int64_t> x_vec = {
+      x_shape.data(), static_cast<int>(x_shape.size()), nullptr};
+
+  // Host mirror of the index. Its pointer is handed to scatter_nd right away,
+  // so request a blocking copy to be sure the data has arrived.
+  DenseTensor index_cpu(index.type());
+  phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu);
+
+  if (index_type == phi::DataType::INT32) {
+    auto index_data = const_cast<int *>(index.data<int>());
+    xpu::VectorParam<int> index_vec{
+        index_cpu.data<int>(), index_size, index_data};
+    r = xpu::scatter_nd<T, int>(ctx.x_context(),
+                                nullptr,
+                                out_grad.data<T>(),
+                                dx_data,
+                                index_vec,
+                                x_vec,
+                                index_shape,
+                                false);
+  } else {
+    auto index_data = const_cast<int64_t *>(index.data<int64_t>());
+    xpu::VectorParam<int64_t> index_vec{
+        index_cpu.data<int64_t>(), index_size, index_data};
+    r = xpu::scatter_nd<T, int64_t>(ctx.x_context(),
+                                    nullptr,
+                                    out_grad.data<T>(),
+                                    dx_data,
+                                    index_vec,
+                                    x_vec,
+                                    index_shape,
+                                    false);
+  }
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "scatter_nd");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_nd_grad,  // registers phi::GatherNdGradKernel
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::GatherNdGradKernel,
+                   float,  // element types T: float, int, int64_t
+                   int,
+                   int64_t) {}
--- a/paddle/phi/kernels/xpu/where_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/where_grad_kernel.cc
@@ -33,6 +33,13 @@ void WhereGradKernel(const Context& ctx,

  auto cond_shape = phi::vectorize(condition.dims());
  auto out_shape = phi::vectorize(out_grad.dims());
+  // Use shape [1] in place of [], because XPU does not support [].
+  if (cond_shape.size() == 0) {
+    cond_shape = std::vector<int64_t>({1});
+  }
+  if (out_shape.size() == 0) {
+    out_shape = std::vector<int64_t>({1});
+  }

  T* dx = nullptr;
  T* dy = nullptr;

--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
@@ -51,6 +51,9 @@ class XPUTestGatherNd(XPUOpTestWrapper):
        def test_check_output(self):
            self.check_output_with_place(self.place)

+        def test_check_grad(self):
+            self.check_grad(['X'], 'Out', check_eager=False)
+
        def init_data(self):
            self.xnp = np.random.random((5, 20)).astype(self.in_type)
            self.inp = np.array([[], []]).astype("int32")
@@ -58,6 +61,10 @@ class XPUTestGatherNd(XPUOpTestWrapper):
                (self.xnp[np.newaxis, :], self.xnp[np.newaxis, :])
            )

+        def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+            self.__class__.dtype = self.dtype
+            self.output_dtype = self.dtype
+
    class XPUTestGatherNdOpWithEmptyIndex1(XPUTestGatherNdBase):
        def init_data(self):
            self.xnp = np.random.random((5, 20)).astype(self.in_type)