Unverified commit acf3e526, authored by zhangkaihuo, committed by GitHub

[Sparse]fix sparse bug (#53390)

Parent 4ea1d041
@@ -126,8 +126,13 @@ class SparseCooTensor : public TensorBase,
   bool valid() const noexcept override { return non_zero_elements_.valid(); }

   /// \brief Test whether the non_zero_elements_ storage is allocated.
-  /// return Whether the non_zero_elements_ storage is allocated.
-  bool initialized() const override { return non_zero_elements_.initialized(); }
+  /// In special cases, when nnz=0, non_zero_elements_ will not need to be
+  /// initialized, but it is necessary to return true here, otherwise the
+  /// gradient will be None.
+  /// return Whether the non_zero_elements_ storage is allocated.
+  bool initialized() const override {
+    return values().initialized() || (nnz() == 0 && numel() > 0);
+  }

   /// \brief resize sparse coo tensor.
   /// \param dense_dims The dims of original dense tensor.
......
@@ -131,8 +131,13 @@ class SparseCsrTensor : public TensorBase,
   bool valid() const noexcept override { return non_zero_elements_.valid(); }

   /// \brief Test whether the non_zero_elements_ storage is allocated.
-  /// return Whether the non_zero_elements_ storage is allocated.
-  bool initialized() const override { return non_zero_elements_.initialized(); }
+  /// In special cases, when nnz=0, non_zero_elements_ will not need to be
+  /// initialized, but it is necessary to return true here, otherwise the
+  /// gradient will be None.
+  /// return Whether the non_zero_elements_ storage is allocated.
+  bool initialized() const override {
+    return values().initialized() || (nnz() == 0 && numel() > 0);
+  }

   /// \brief resize sparse csr tensor.
   /// \param dense_dims The dims of original dense tensor.
......
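For context, here is a minimal Python sketch of the behavior the `initialized()` change enables, mirroring the new unit test further down and assuming only the Paddle sparse API that test already uses: with nnz == 0 the values storage is never allocated, but the tensor must still report as initialized so autograd yields an all-zero gradient instead of None.

```python
import numpy as np
import paddle

# An all-zero dense tensor converts to a sparse COO tensor with nnz == 0.
x = paddle.zeros([2, 2, 2])
sp = x.to_sparse_coo(1)
sp.stop_gradient = False

out = sp.to_dense()
out.backward()

# With the fix, initialized() returns true for the empty sparse tensor,
# so the gradient is a valid all-zero sparse tensor rather than None.
assert sp.grad is not None
np.testing.assert_allclose(sp.grad.to_dense().numpy().sum(), 0.0)
```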
@@ -113,13 +113,6 @@ void CsrToCooCPUKernel(const CPUContext& dev_ctx,
                        SparseCooTensor* out) {
   const DDim& x_dims = x.dims();
   const int64_t non_zero_num = x.cols().numel();
-  const auto& csr_crows = x.crows();
-  const auto& csr_cols = x.cols();
-  const auto& csr_values = x.values();
-  const IntT* csr_crows_data = csr_crows.data<IntT>();
-  const IntT* csr_cols_data = csr_cols.data<IntT>();
-  const T* csr_values_data = csr_values.data<T>();
   int64_t sparse_dim = 2;
   if (x_dims.size() == 3) {
     sparse_dim = 3;
@@ -127,6 +120,17 @@ void CsrToCooCPUKernel(const CPUContext& dev_ctx,
   phi::DenseTensor indices =
       phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
   phi::DenseTensor values = phi::Empty<T>(dev_ctx, {non_zero_num});
+  if (x.nnz() <= 0) {
+    out->SetMember(indices, values, x_dims, true);
+    return;
+  }
+  const auto& csr_crows = x.crows();
+  const auto& csr_cols = x.cols();
+  const auto& csr_values = x.values();
+  const IntT* csr_crows_data = csr_crows.data<IntT>();
+  const IntT* csr_cols_data = csr_cols.data<IntT>();
+  const T* csr_values_data = csr_values.data<T>();
   IntT* coo_indices = indices.data<IntT>();
   IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
   IntT* coo_rows_data =
@@ -177,7 +181,6 @@ void CooToCsrCPUKernel(const CPUContext& dev_ctx,
       phi::errors::InvalidArgument(
           "SparseCsrTensor only support 2-D or 3-D matrix"));
   const int64_t non_zero_num = x.nnz();
-  if (non_zero_num <= 0) return;
   int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
@@ -185,6 +188,10 @@ void CooToCsrCPUKernel(const CPUContext& dev_ctx,
   phi::DenseTensor crows = phi::Empty<IntT>(dev_ctx, {batchs * (rows + 1)});
   phi::DenseTensor cols = phi::Empty<IntT>(dev_ctx, {non_zero_num});
   phi::DenseTensor values = phi::EmptyLike<T, CPUContext>(dev_ctx, x.values());
+  if (non_zero_num <= 0) {
+    out->SetMember(crows, cols, values, x_dims);
+    return;
+  }
   IntT* csr_crows_data = crows.data<IntT>();
   IntT* csr_cols_data = cols.data<IntT>();
   T* csr_values_data = values.data<T>();
@@ -268,6 +275,12 @@ void CooToDenseCPUKernel(const CPUContext& dev_ctx,
   const T* x_data = values.data<T>();
   dev_ctx.template Alloc<T>(out);
   T* out_data = out->data<T>();
+  memset(out_data, 0, sizeof(T) * out->numel());
+  if (x.nnz() <= 0) {
+    return;
+  }
   int64_t base_offset = 1;
   for (int64_t i = 0; i < dense_dim; i++) {
     base_offset *= dense_dims[sparse_dim + i];
@@ -279,7 +292,6 @@ void CooToDenseCPUKernel(const CPUContext& dev_ctx,
     offset *= dense_dims[i];
   }
-  memset(out_data, 0, sizeof(T) * out->numel());
   for (auto i = 0; i < non_zero_num; i++) {
     int64_t index = 0;
     for (int j = 0; j < sparse_dim; j++) {
......
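A quick sanity check of the CPU conversion paths above, given as a hedged sketch that uses only the API exercised by the new test: with the early returns in place, an all-zero tensor round-trips through CSR and back to dense without reading uninitialized crows/cols/values buffers, and the memset that now runs before the nnz check guarantees a zero-filled dense output.

```python
import numpy as np
import paddle

paddle.device.set_device('cpu')

x = paddle.zeros([2, 2, 2])      # dense tensor with no non-zero entries
sp_csr = x.to_sparse_csr()       # hits the zero-nnz path in the CSR kernels
dense = sp_csr.to_dense()        # output buffer is zero-filled up front

np.testing.assert_allclose(dense.numpy(), x.numpy())
```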
@@ -61,6 +61,13 @@ void MaskCooGPUKernel(const GPUContext& dev_ctx,
       phi::errors::InvalidArgument("the input x and mask must have the shape"));
   const DenseTensor& indices = mask.indices();
   const DenseTensor& values = mask.values();
+  DenseTensor out_indices = phi::EmptyLike<IntT>(dev_ctx, indices);
+  DenseTensor out_values = phi::EmptyLike<T>(dev_ctx, values);
+  if (mask.nnz() <= 0) {
+    out->SetMember(out_indices, out_values, dims, true);
+    return;
+  }
   const int sparse_dim = mask.sparse_dim();
   DenseTensor sparse_offsets = phi::Empty<GPUContext>(
       dev_ctx,
@@ -75,9 +82,6 @@ void MaskCooGPUKernel(const GPUContext& dev_ctx,
                        gpuMemcpyHostToDevice,
                        dev_ctx.stream());
-  DenseTensor out_indices = phi::EmptyLike<T>(dev_ctx, indices);
-  DenseTensor out_values = phi::EmptyLike<T>(dev_ctx, values);
   phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices);
   const IntT* indices_ptr = indices.data<IntT>();
......
@@ -164,6 +164,7 @@ void DenseToCooKernel(const Context& dev_ctx,
   T* sparse_data = dev_ctx.template Alloc<T>(&values);

   // 3. calc indices by indexs and get values by indexs
+  if (non_zero_num > 0) {
     config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
     GetNonZeroElementsAndIndices<<<config.block_per_grid.x,
                                    config.thread_per_block.x,
@@ -176,6 +177,7 @@ void DenseToCooKernel(const Context& dev_ctx,
                                                    temp_indexs_ptr,
                                                    indices_data,
                                                    sparse_data);
+  }
   out->SetMember(indices, values, x_dims, true);
 }
@@ -218,6 +220,21 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx,
                        SparseCooTensor* out) {
   const DDim& x_dims = x.dims();
   const int64_t non_zero_num = x.cols().numel();
+  int64_t sparse_dim = 2;
+  if (x_dims.size() == 3) {
+    sparse_dim = 3;
+  }
+  if (x.nnz() <= 0) {
+#ifdef PADDLE_WITH_HIP
+    DenseTensor indices = phi::Empty<int>(dev_ctx, {sparse_dim, non_zero_num});
+#else
+    DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
+#endif
+    DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, x.values());
+    out->SetMember(indices, values, x_dims, true);
+    return;
+  }
   // rocsparse_csr2coo only support index with type 'rocsparse_int' (aka 'int')
   // now
@@ -235,10 +252,6 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx,
   const auto& csr_values = x.values();
   const T* csr_values_data = csr_values.data<T>();
-  int64_t sparse_dim = 2;
-  if (x_dims.size() == 3) {
-    sparse_dim = 3;
-  }
   int batches = x_dims.size() == 2 ? 1 : x_dims[0];
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
@@ -395,7 +408,6 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx,
       phi::errors::InvalidArgument(
           "SparseCsrTensor only support 2-D or 3-D matrix"));
   const int64_t non_zero_num = x.nnz();
-  if (non_zero_num <= 0) return;
   int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
@@ -403,6 +415,10 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx,
   phi::DenseTensor crows = phi::Empty<IntT>(dev_ctx, {batchs * (rows + 1)});
   phi::DenseTensor cols = phi::Empty<IntT>(dev_ctx, {non_zero_num});
   phi::DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, x.values());
+  if (non_zero_num <= 0) {
+    out->SetMember(crows, cols, values, x_dims);
+    return;
+  }
   IntT* csr_crows_data = crows.data<IntT>();
   IntT* csr_cols_data = cols.data<IntT>();
   T* csr_values_data = values.data<T>();
@@ -503,10 +519,17 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx,
   const int64_t dense_dim = values.dims().size() - 1;
   const auto place = dev_ctx.GetPlace();
-  const T* x_data = values.data<T>();
   dev_ctx.template Alloc<T>(out);
   T* out_data = out->data<T>();
+  phi::backends::gpu::GpuMemsetAsync(
+      out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream());
+  if (x.nnz() <= 0) {
+    return;
+  }
+  const T* x_data = values.data<T>();
   int64_t base_offset = 1;
   for (int64_t i = 0; i < dense_dim; i++) {
     base_offset *= dense_dims[sparse_dim + i];
@@ -525,8 +548,6 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx,
                        sparse_dim * sizeof(int64_t),
                        gpuMemcpyHostToDevice,
                        dev_ctx.stream());
-  phi::backends::gpu::GpuMemsetAsync(
-      out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream());
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
......
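The GPU kernels follow the same pattern; note that GpuMemsetAsync now runs before the nnz == 0 early return, so CooToDense on an empty sparse tensor still produces a fully zeroed output. A minimal sketch, guarded on CUDA availability:

```python
import numpy as np
import paddle

if paddle.is_compiled_with_cuda():
    paddle.device.set_device('gpu')
    x = paddle.zeros([2, 2, 2])
    sp_coo = x.to_sparse_coo(1)   # sparse COO tensor with nnz == 0
    dense = sp_coo.to_dense()     # output is zero-filled before the early return
    np.testing.assert_allclose(dense.numpy(), x.numpy())
```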
@@ -351,6 +351,27 @@ class TestSparseConvert(unittest.TestCase):
             dense_x[2] = 0
             verify(dense_x)

+    def test_zero_nnz(self):
+        for device in devices:
+            if device == 'cpu' or (
+                device == 'gpu' and paddle.is_compiled_with_cuda()
+            ):
+                paddle.device.set_device(device)
+                x1 = paddle.zeros([2, 2, 2])
+                x2 = paddle.zeros([2, 2, 2])
+                sp_csr_x = x1.to_sparse_csr()
+                sp_coo_x = x2.to_sparse_coo(1)
+                sp_coo_x.stop_gradient = False
+
+                out1 = sp_csr_x.to_dense()
+                out2 = sp_coo_x.to_dense()
+                out2.backward()
+                np.testing.assert_allclose(out1.numpy(), x1.numpy())
+                np.testing.assert_allclose(out2.numpy(), x2.numpy())
+                np.testing.assert_allclose(
+                    sp_coo_x.grad.to_dense().numpy().sum(), 0.0
+                )
+

 class TestCooError(unittest.TestCase):
     def test_small_shape(self):
......