Unverified commit aeb8c2e2, authored by zhangkaihuo, committed by GitHub

[Sparse] Fix the bug of elementwise_grad (#52102)

Parent 8b622d58
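In short: the elementwise backward kernels previously copied dout into dx/dy wholesale, so each gradient inherited dout's sparsity pattern (the union of both inputs' patterns) instead of the pattern of its own input. A minimal sketch of the affected scenario, using only the paddle.sparse API exercised by the updated test below (values and setup are illustrative, not taken from the patch):

import paddle

# Two 2x2 tensors whose sparse forms have different sparsity patterns.
dense_x = paddle.to_tensor([[0.0, 1.0], [2.0, 0.0]], stop_gradient=False)
dense_y = paddle.to_tensor([[3.0, 0.0], [0.0, 4.0]], stop_gradient=False)
csr_x = dense_x.detach().to_sparse_csr()
csr_y = dense_y.detach().to_sparse_csr()
csr_x.stop_gradient = False
csr_y.stop_gradient = False

out = paddle.sparse.add(csr_x, csr_y)
out.backward()

# With this fix, csr_x.grad keeps x's own pattern (2 stored values) rather
# than dout's union pattern (4 stored values); it matches the dense gradient
# of x masked to x's non-zero positions.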
@@ -27,6 +27,7 @@ limitations under the License. */
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
namespace phi {
@@ -39,6 +40,7 @@ void AllocCsrPtr(const Context& dev_ctx,
DenseTensor dx_crows = phi::EmptyLike<IntT>(dev_ctx, x.crows());
DenseTensor dx_cols = phi::EmptyLike<IntT>(dev_ctx, x.cols());
DenseTensor dx_values = phi::EmptyLike<T>(dev_ctx, x.values());
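// Propagate x's tensor meta (dims, dtype, layout) to dx before SetMember
// installs the freshly allocated components.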
dx->set_meta(x.meta());
dx->SetMember(dx_crows, dx_cols, dx_values, x.dims());
}
@@ -48,9 +50,117 @@ void AllocCooPtr(const Context& dev_ctx,
SparseCooTensor* dx) {
DenseTensor dx_indices = phi::EmptyLike<IntT>(dev_ctx, x.indices());
DenseTensor dx_values = phi::EmptyLike<T>(dev_ctx, x.values());
dx->set_meta(x.meta());
dx->SetMember(dx_indices, dx_values, x.dims(), x.coalesced());
}
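// CopyCooValues writes dout's values into dx, whose indices are copied from
// x: where an index of x also appears in dout, dout's value is copied over;
// indices of x that are missing from dout get zeros.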
template <typename T, typename IntT, typename Context>
void CopyCooValues(const Context& dev_ctx,
const SparseCooTensor& dout,
const SparseCooTensor& x,
SparseCooTensor* dx) {
Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, dx->mutable_indices());
const int sparse_dim = x.sparse_dim();
std::vector<IntT> sparse_offsets(sparse_dim), dout_indexs(dout.nnz()),
x_indexs(x.nnz());
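// Flatten every multi-dimensional sparse index to a single linear offset so
// that the two sorted index lists can be merged in one pass.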
phi::funcs::sparse::CalcOffsetsPerDim<IntT>(
dout.dims(), sparse_dim, sparse_offsets.data());
phi::funcs::sparse::FlattenIndices(dout.indices().data<IntT>(),
sparse_offsets.data(),
dout.nnz(),
sparse_dim,
0,
1,
dout_indexs.data());
phi::funcs::sparse::FlattenIndices(x.indices().data<IntT>(),
sparse_offsets.data(),
x.nnz(),
sparse_dim,
0,
1,
x_indexs.data());
size_t i = 0, j = 0;
T* dx_values_ptr = dx->mutable_values()->data<T>();
const T* dout_values_ptr = dout.values().data<T>();
// Each non-zero entry may carry a dense slice of values; element_size is the
// number of elements per entry (the product of the trailing value dims).
int64_t element_size = 1;
for (int dim = 1; dim < x.values().dims().size(); ++dim) {
element_size *= x.values().dims()[dim];
}
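// Two-pointer sweep over the sorted flattened index lists: on a match, copy
// dout's slice into dx; if x owns an index that dout lacks, zero-fill it;
// indices present only in dout are skipped.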
while (i < dout_indexs.size() && j < x_indexs.size()) {
if (dout_indexs[i] == x_indexs[j]) {
memcpy(dx_values_ptr + j * element_size,
dout_values_ptr + i * element_size,
element_size * sizeof(T));
++i;
++j;
} else if (dout_indexs[i] > x_indexs[j]) {
memset(dx_values_ptr + j * element_size, 0, element_size * sizeof(T));
++j;
} else {
++i;
}
}
while (j < x_indexs.size()) {
memset(dx_values_ptr + j * element_size, 0, element_size * sizeof(T));
++j;
}
}
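// CopyCsrValues is the CSR counterpart of CopyCooValues: per batch and row it
// merges the sorted column lists of x and dout, copying matching values and
// zero-filling columns that exist only in x.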
template <typename T, typename IntT, typename Context>
void CopyCsrValues(const Context& dev_ctx,
const SparseCsrTensor& dout,
const SparseCsrTensor& x,
SparseCsrTensor* dx) {
Copy(dev_ctx, x.crows(), dev_ctx.GetPlace(), false, dx->mutable_crows());
Copy(dev_ctx, x.cols(), dev_ctx.GetPlace(), false, dx->mutable_cols());
const auto& x_dims = x.dims();
int batch = x_dims.size() == 2 ? 1 : x_dims[0];
int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
const IntT* x_crows_ptr = x.crows().data<IntT>();
const IntT* x_cols_ptr = x.cols().data<IntT>();
const IntT* dout_crows_ptr = dout.crows().data<IntT>();
const IntT* dout_cols_ptr = dout.cols().data<IntT>();
const T* dout_values_ptr = dout.values().data<T>();
T* dx_values_ptr = dx->mutable_values()->data<T>();
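// crows delimits each row's half-open [start, start + nnz) range within
// cols/values; merge the two ranges row by row.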
for (int b = 0; b < batch; b++) {
for (int r = 0; r < rows; r++) {
int x_start = x_crows_ptr[b * (rows + 1) + r];
int dout_start = dout_crows_ptr[b * (rows + 1) + r];
int x_row_nnz = x_crows_ptr[b * (rows + 1) + r + 1] - x_start;
int dout_row_nnz = dout_crows_ptr[b * (rows + 1) + r + 1] - dout_start;
int i = 0, j = 0;
while (i < x_row_nnz && j < dout_row_nnz) {
if (x_cols_ptr[x_start + i] == dout_cols_ptr[dout_start + j]) {
dx_values_ptr[x_start + i] = dout_values_ptr[dout_start + j];
++i;
++j;
} else if (x_cols_ptr[x_start + i] < dout_cols_ptr[dout_start + j]) {
dx_values_ptr[x_start + i] = static_cast<T>(0);
++i;
} else {
++j;
}
}
while (i < x_row_nnz) {
dx_values_ptr[x_start + i] = static_cast<T>(0);
++i;
}
}
}
}
template <typename T, typename IntT, typename Context>
void ElementWiseAddCsrGradCPUKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
@@ -62,16 +172,16 @@ void ElementWiseAddCsrGradCPUKernel(const Context& dev_ctx,
if (dx != nullptr && dy == nullptr) {
VLOG(4) << "Special case when dy is not needed";
AllocCsrPtr<T, IntT>(dev_ctx, x, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, x, dx);
} else if (dx == nullptr && dy != nullptr) {
VLOG(4) << "Special case when dx is not needed";
AllocCsrPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, y, dy);
} else {
AllocCsrPtr<T, IntT>(dev_ctx, x, dx);
AllocCsrPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, x, dx);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, y, dy);
}
}
@@ -84,12 +194,12 @@ void ElementWiseSubtractCsrGradCPUKernel(const Context& dev_ctx,
SparseCsrTensor* dy) {
if (dx) {
AllocCsrPtr<T, IntT>(dev_ctx, x, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, x, dx);
}
if (dy) {
AllocCsrPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCsrValues<T, IntT, Context>(dev_ctx, dout, y, dy);
phi::NegativeKernel<T, Context>(
dev_ctx, dout.values(), dy->mutable_values());
}
@@ -105,13 +215,19 @@ void ElementWiseMultiplyCsrGradCPUKernel(const Context& dev_ctx,
if (dx) {
// dout*y
AllocCsrPtr<T, IntT>(dev_ctx, x, dx);
sparse::ElementWiseMultiplyCsrKernel<T, Context>(dev_ctx, dout, y, dx);
SparseCsrTensor tmp_dx;
AllocCsrPtr<T, IntT>(dev_ctx, x, &tmp_dx);
sparse::ElementWiseMultiplyCsrKernel<T, Context>(dev_ctx, dout, y, &tmp_dx);
CopyCsrValues<T, IntT, Context>(dev_ctx, tmp_dx, x, dx);
}
if (dy) {
// dout*x
AllocCsrPtr<T, IntT>(dev_ctx, y, dy);
sparse::ElementWiseMultiplyCsrKernel<T, Context>(dev_ctx, dout, x, dy);
SparseCsrTensor tmp_dy;
AllocCsrPtr<T, IntT>(dev_ctx, y, &tmp_dy);
sparse::ElementWiseMultiplyCsrKernel<T, Context>(dev_ctx, dout, x, &tmp_dy);
CopyCsrValues<T, IntT, Context>(dev_ctx, tmp_dy, y, dy);
}
}
@@ -126,17 +242,24 @@ void ElementWiseDivideCsrGradCPUKernel(const Context& dev_ctx,
if (dx) {
// dout/y
AllocCsrPtr<T, IntT>(dev_ctx, x, dx);
sparse::ElementWiseDivideCsrKernel<T, Context>(dev_ctx, dout, y, dx);
SparseCsrTensor tmp_dx;
AllocCsrPtr<T, IntT>(dev_ctx, x, &tmp_dx);
sparse::ElementWiseDivideCsrKernel<T, Context>(dev_ctx, dout, y, &tmp_dx);
CopyCsrValues<T, IntT, Context>(dev_ctx, tmp_dx, x, dx);
}
if (dy) {
// -dout * out / y
AllocCsrPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
SparseCsrTensor tmp_dy;
AllocCsrPtr<T, IntT>(dev_ctx, y, &tmp_dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, &tmp_dy);
phi::NegativeKernel<T, Context>(
dev_ctx, dout.values(), dy->mutable_values());
auto tmp = sparse::ElementWiseMultiplyCsr<T, Context>(dev_ctx, *dy, out);
sparse::ElementWiseDivideCsrKernel<T, Context>(dev_ctx, tmp, y, dy);
dev_ctx, dout.values(), tmp_dy.mutable_values());
auto tmp = sparse::ElementWiseMultiplyCsr<T, Context>(dev_ctx, tmp_dy, out);
sparse::ElementWiseDivideCsrKernel<T, Context>(dev_ctx, tmp, y, &tmp_dy);
CopyCsrValues<T, IntT, Context>(dev_ctx, tmp_dy, y, dy);
}
}
@@ -151,16 +274,16 @@ void ElementWiseAddCooGradCPUKernel(const Context& dev_ctx,
if (dx != nullptr && dy == nullptr) {
VLOG(4) << "Special case when dy is not needed";
AllocCooPtr<T, IntT>(dev_ctx, x, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, x, dx);
} else if (dx == nullptr && dy != nullptr) {
VLOG(4) << "Special case when dx is not needed";
AllocCooPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, y, dy);
} else {
AllocCooPtr<T, IntT>(dev_ctx, x, dx);
AllocCooPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, x, dx);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, y, dy);
}
}
@@ -173,12 +296,12 @@ void ElementWiseSubtractCooGradCPUKernel(const Context& dev_ctx,
SparseCooTensor* dy) {
if (dx) {
AllocCooPtr<T, IntT>(dev_ctx, x, dx);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, x, dx);
}
if (dy) {
AllocCooPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
CopyCooValues<T, IntT, Context>(dev_ctx, dout, y, dy);
phi::NegativeKernel<T, Context>(
dev_ctx, dout.values(), dy->mutable_values());
}
@@ -194,13 +317,19 @@ void ElementWiseMultiplyCooGradCPUKernel(const Context& dev_ctx,
if (dx) {
// dout*y
AllocCooPtr<T, IntT>(dev_ctx, x, dx);
sparse::ElementWiseMultiplyCooKernel<T, Context>(dev_ctx, dout, y, dx);
SparseCooTensor tmp_dx;
AllocCooPtr<T, IntT>(dev_ctx, x, &tmp_dx);
sparse::ElementWiseMultiplyCooKernel<T, Context>(dev_ctx, dout, y, &tmp_dx);
CopyCooValues<T, IntT, Context>(dev_ctx, tmp_dx, x, dx);
}
if (dy) {
// dout*x
AllocCooPtr<T, IntT>(dev_ctx, y, dy);
sparse::ElementWiseMultiplyCooKernel<T, Context>(dev_ctx, dout, x, dy);
SparseCooTensor tmp_dy;
AllocCooPtr<T, IntT>(dev_ctx, y, &tmp_dy);
sparse::ElementWiseMultiplyCooKernel<T, Context>(dev_ctx, dout, x, &tmp_dy);
CopyCooValues<T, IntT, Context>(dev_ctx, tmp_dy, y, dy);
}
}
@@ -215,22 +344,26 @@ void ElementWiseDivideCooGradCPUKernel(const Context& dev_ctx,
if (dx) {
// dout/y
AllocCooPtr<T, IntT>(dev_ctx, x, dx);
sparse::ElementWiseDivideCooKernel<T, Context>(dev_ctx, dout, y, dx);
SparseCooTensor tmp_dx;
AllocCooPtr<T, IntT>(dev_ctx, x, &tmp_dx);
sparse::ElementWiseDivideCooKernel<T, Context>(dev_ctx, dout, y, &tmp_dx);
CopyCooValues<T, IntT, Context>(dev_ctx, tmp_dx, x, dx);
}
if (dy) {
// -dout * out / y
AllocCooPtr<T, IntT>(dev_ctx, y, dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy);
SparseCooTensor tmp_dy;
AllocCooPtr<T, IntT>(dev_ctx, y, &tmp_dy);
Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, &tmp_dy);
phi::NegativeKernel<T, Context>(
dev_ctx, dout.values(), dy->mutable_values());
auto tmp = sparse::ElementWiseMultiplyCoo<T, Context>(dev_ctx, *dy, out);
sparse::ElementWiseDivideCooKernel<T, Context>(dev_ctx, tmp, y, dy);
dev_ctx, dout.values(), tmp_dy.mutable_values());
auto tmp = sparse::ElementWiseMultiplyCoo<T, Context>(dev_ctx, tmp_dy, out);
sparse::ElementWiseDivideCooKernel<T, Context>(dev_ctx, tmp, y, &tmp_dy);
CopyCooValues<T, IntT, Context>(dev_ctx, tmp_dy, y, dy);
}
}
// CPU Kernel end
// Kernel
template <typename T, typename Context>
void ElementWiseDivideCsrGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
......
@@ -32,18 +32,13 @@ template <typename T, typename Functor>
struct BinaryOPWithZeroCompareFunctor {
explicit BinaryOPWithZeroCompareFunctor(Functor functor)
: functor_(functor) {}
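// operator() now returns void: it no longer reports whether every computed
// value is zero, so Merge (further down) keeps each merged entry regardless
// of its value.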
inline HOSTDEVICE bool operator()(const T* a,
inline HOSTDEVICE void operator()(const T* a,
const T* b,
T* result,
const int64_t len) const {
bool is_zero = true;
for (int64_t i = 0; i < len; ++i) {
result[i] = functor_(a[i], b[i]);
if (result[i] != 0) {
is_zero = false;
}
}
return is_zero;
}
Functor functor_;
};
@@ -88,55 +83,41 @@ void Merge(const IntT el_len,
// merge
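// Every merged entry is now recorded even when its computed values are all
// zero, so c's sparsity pattern is exactly the union of a's and b's patterns.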
while (a < len_a && b < (is_divide ? len_b_max : len_b)) {
if (a_index[a] == b_index[b]) {
if (!functor(a_values + a * el_len,
b_values[b_index[b]],
c_values + nnz * el_len,
el_len)) {
c_index[nnz] = a_index[a];
++nnz;
}
functor(a_values + a * el_len,
b_values[b_index[b]],
c_values + nnz * el_len,
el_len);
c_index[nnz] = a_index[a];
++nnz;
++a;
++b;
} else if (a_index[a] < b_index[b]) { // coordinate x[a] < coordinate y[b]
if (!functor(a_values + a * el_len,
zero.data(),
c_values + nnz * el_len,
el_len)) {
c_index[nnz] = a_index[a];
++nnz;
}
functor(
a_values + a * el_len, zero.data(), c_values + nnz * el_len, el_len);
c_index[nnz] = a_index[a];
++nnz;
++a;
} else if (a_index[a] > b_index[b]) { // coordinate x[a] > coordinate y[b]
if (!functor(zero.data(),
b_values[b_index[b]],
c_values + nnz * el_len,
el_len)) {
c_index[nnz] = b_index[b];
++nnz;
}
functor(
zero.data(), b_values[b_index[b]], c_values + nnz * el_len, el_len);
c_index[nnz] = b_index[b];
++nnz;
++b;
}
}
// a tail
while (a < len_a) {
if (!functor(a_values + a * el_len,
zero.data(),
c_values + nnz * el_len,
el_len)) {
c_index[nnz] = a_index[a];
++nnz;
}
functor(
a_values + a * el_len, zero.data(), c_values + nnz * el_len, el_len);
c_index[nnz] = a_index[a];
++nnz;
++a;
}
// b tail
while (b < (is_divide ? len_b_max : len_b)) {
if (!functor(zero.data(),
b_values[b_index[b]],
c_values + nnz * el_len,
el_len)) {
c_index[nnz] = b_index[b];
++nnz;
}
functor(zero.data(), b_values[b_index[b]], c_values + nnz * el_len, el_len);
c_index[nnz] = b_index[b];
++nnz;
++b;
}
}
......
@@ -37,6 +37,11 @@ def get_actual_res(x, y, op):
return res
def mask_to_zero(x, mask):
x[mask == 0] = 0
return x
class TestSparseElementWiseAPI(unittest.TestCase):
"""
test paddle.sparse.add, subtract, multiply, divide
@@ -45,14 +50,20 @@ class TestSparseElementWiseAPI(unittest.TestCase):
def setUp(self):
np.random.seed(2022)
self.op_list = op_list
self.csr_shape = [128, 256]
self.csr_shape = [8, 8]
self.coo_shape = [4, 8, 3, 5]
self.support_dtypes = ['float32', 'float64', 'int32', 'int64']
def func_test_csr(self, op):
for dtype in self.support_dtypes:
x = np.random.randint(-255, 255, size=self.csr_shape).astype(dtype)
y = np.random.randint(-255, 255, size=self.csr_shape).astype(dtype)
x = np.random.randint(-255, 255, size=self.csr_shape)
y = np.random.randint(-255, 255, size=self.csr_shape)
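# x / x is 1.0 where x != 0 and NaN (0 / 0) where x == 0; resetting everything
# that is not exactly 1 turns mask_x / mask_y into 0/1 sparsity masks.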
mask_x = x / x
mask_y = y / y
mask_x[mask_x != 1] = 0
mask_y[mask_y != 1] = 0
x = x.astype(dtype)
y = y.astype(dtype)
dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False)
dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False)
@@ -63,9 +74,10 @@ class TestSparseElementWiseAPI(unittest.TestCase):
csr_y = s_dense_y.to_sparse_csr()
actual_res = get_actual_res(csr_x, csr_y, op)
actual_res.backward()
expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res)
expect_res.backward()
np.testing.assert_allclose(
expect_res.numpy(),
@@ -74,15 +86,14 @@ class TestSparseElementWiseAPI(unittest.TestCase):
equal_nan=True,
)
if not (op == __truediv__ and dtype in ['int32', 'int64']):
actual_res.backward(actual_res)
np.testing.assert_allclose(
dense_x.grad.numpy(),
mask_to_zero(dense_x.grad.numpy(), mask_x),
csr_x.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
np.testing.assert_allclose(
dense_y.grad.numpy(),
mask_to_zero(dense_y.grad.numpy(), mask_y),
csr_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
@@ -124,12 +135,14 @@ class TestSparseElementWiseAPI(unittest.TestCase):
rtol=1e-05,
equal_nan=True,
)
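# The sparse gradient must also preserve the dense shape of its input.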
np.testing.assert_allclose(coo_x.shape, coo_x.grad.shape)
np.testing.assert_allclose(
dense_x.grad.numpy(),
coo_x.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
np.testing.assert_allclose(coo_y.shape, coo_y.grad.shape)
np.testing.assert_allclose(
dense_y.grad.numpy(),
coo_y.grad.to_dense().numpy(),
......