PaddlePaddle / Paddle
Commit 44d8c6ed (unverified)
Authored by zhangkaihuo on Apr 19, 2022; committed via GitHub on Apr 19, 2022
Add kernel sparse_mask_helper; sparse_coo_tensor_grad (#41586) (#41902)
Cherry-pick of PR #41586 to release/2.3.
Parent commit: dab7dfbf
Showing 13 changed files with 476 additions and 21 deletions (+476, -21)
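
For context before the per-file diffs: the new `sparse_coo_tensor` forward kernel and its `sparse_coo_tensor_grad` backward (implemented via the new `sparse_mask_helper` kernel) let gradients flow back to the dense `values` tensor used to build a sparse COO tensor. The sketch below is a minimal usage example adapted from the `test_sparse_coo_tensor_grad` case added in `test_sparse_utils_op.py` in this commit; it assumes an eager-mode build of this branch and is illustrative rather than authoritative.

    import numpy as np
    import paddle
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        # A 2x2 sparse COO tensor whose values participate in autograd.
        indices = paddle.to_tensor([[0, 1], [0, 1]], dtype='int32')
        values = paddle.to_tensor([1, 2], dtype='float32', stop_gradient=False)
        sparse_x = paddle.sparse.sparse_coo_tensor(
            indices, values, shape=[2, 2], stop_gradient=False)

        # Upstream gradient has entries at (0, 1) and (1, 1). Only (1, 1)
        # overlaps with sparse_x's indices, so values.grad becomes [0., 3.].
        grad_indices = paddle.to_tensor([[0, 1], [1, 1]], dtype='int32')
        grad_values = paddle.to_tensor([2, 3], dtype='float32')
        sparse_out_grad = paddle.sparse.sparse_coo_tensor(
            grad_indices, grad_values, shape=[2, 2])

        sparse_x.backward(sparse_out_grad)
        assert np.array_equal([0, 3], values.grad.numpy())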
Changed files:
paddle/phi/kernels/funcs/sparse/common_shape.h                   +39  -0
paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc              +90  -11
paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc             +12  -0
paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu              +165 -1
paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu             +12  -0
paddle/phi/kernels/sparse/sparse_mask_kernel.h                   +6   -0
paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc            +25  -0
paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h             +9   -0
paddle/phi/kernels/sparse/sparse_utils_kernel.h                  +12  -0
python/paddle/fluid/tests/unittests/test_sparse_utils_op.py      +58  -6
python/paddle/sparse/creation.py                                 +33  -3
python/paddle/utils/code_gen/sparse_api.yaml                     +8   -0
python/paddle/utils/code_gen/sparse_bw_api.yaml                  +7   -0
paddle/phi/kernels/funcs/sparse/common_shape.h

@@ -40,6 +40,45 @@ inline const DDim InferDenseDims(const DDim& x_dims,
   return values_dims;
 }
 
+template <typename IntT>
+inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices,
+                                            const IntT* sparse_offsets,
+                                            const int64_t non_zero_num,
+                                            const int64_t sparse_dim,
+                                            const int i) {
+  IntT index = 0;
+  for (IntT j = 0; j < sparse_dim; j++) {
+    index += indices[j * non_zero_num + i] * sparse_offsets[j];
+  }
+  return index;
+}
+
+template <typename IntT>
+inline void HOSTDEVICE FlattenIndices(const IntT* indices,
+                                      const IntT* sparse_offsets,
+                                      const int64_t non_zero_num,
+                                      const int64_t sparse_dim,
+                                      const int start,
+                                      const int stride,
+                                      IntT* out) {
+  for (int i = start; i < non_zero_num; i += stride) {
+    out[i] =
+        IndicesToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i);
+  }
+}
+
+// 1. indices.dims().size() == 2
+template <typename IntT>
+inline void CalcOffsetsPerDim(const DDim& dims,
+                              const int64_t sparse_dim,
+                              std::vector<IntT>* offsets) {
+  IntT offset = 1;
+  for (IntT i = sparse_dim - 1; i >= 0; i--) {
+    (*offsets)[i] = offset;
+    offset *= dims[i];
+  }
+}
+
 }  // namespace sparse
 }  // namespace funcs
 }  // namespace phi
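
The helpers above flatten one column of a `[sparse_dim, non_zero_num]` indices matrix into a single linear index using row-major strides. A small standalone Python sketch of the same arithmetic (illustrative only, not Paddle code):

    def calc_offsets_per_dim(dims, sparse_dim):
        # offsets[i] is the row-major stride of dimension i.
        offsets = [0] * sparse_dim
        offset = 1
        for i in range(sparse_dim - 1, -1, -1):
            offsets[i] = offset
            offset *= dims[i]
        return offsets

    def indices_to_index(indices, offsets, non_zero_num, sparse_dim, i):
        # Coordinate (j, i) is stored at indices[j * non_zero_num + i].
        return sum(indices[j * non_zero_num + i] * offsets[j]
                   for j in range(sparse_dim))

    # Example: a 3x4 tensor with non-zeros at (0, 1) and (2, 3).
    dims, nnz = [3, 4], 2
    flat_indices = [0, 2,   # row coordinates
                    1, 3]   # column coordinates
    offsets = calc_offsets_per_dim(dims, 2)          # [4, 1]
    print([indices_to_index(flat_indices, offsets, nnz, 2, i)
           for i in range(nnz)])                     # [1, 11]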
paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc

@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 
 #include "paddle/phi/api/ext/dispatch.h"

@@ -38,12 +39,6 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx,
   const DenseTensor& indices = mask.non_zero_indices();
   const DenseTensor& values = mask.non_zero_elements();
   int sparse_dim = indices.dims().size();
-  std::vector<int64_t> sparse_offsets(sparse_dim);
-  int64_t offset = 1;
-  for (int i = sparse_dim - 1; i >= 0; i--) {
-    sparse_offsets[i] = offset;
-    offset *= dims[i];
-  }
 
   DenseTensor out_indices = phi::EmptyLike<T>(dev_ctx, indices);
   DenseTensor out_values = phi::EmptyLike<T>(dev_ctx, values);

@@ -51,21 +46,25 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx,
   // the out_indices is same as indices of mask
   phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices);
 
-  const IntT* indices_ptr = indices.data<IntT>();
   T* out_values_ptr = out_values.data<T>();
   const T* x_ptr = x.data<T>();
-
   const int64_t non_zero_num = mask.nnz();
   auto dims_2d = flatten_to_2d(dims, sparse_dim);
   const int cols = dims_2d[1];
+  const IntT* indices_ptr = indices.data<IntT>();
+
+  std::vector<IntT> out_indexs(non_zero_num), sparse_offsets(sparse_dim);
+  phi::funcs::sparse::CalcOffsetsPerDim<IntT>(
+      dims, sparse_dim, &sparse_offsets);
 
   for (int64_t i = 0; i < non_zero_num; i++) {
-    int64_t index = 0;
-    for (int j = 0; j < sparse_dim; j++) {
-      index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j];
-    }
+    int64_t index = phi::funcs::sparse::IndicesToIndex<IntT>(
+        indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i);
     memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T));
   }
 
   out->SetMember(out_indices, out_values, dims, true);
 }

@@ -85,6 +84,73 @@ void SparseMaskKernel(const Context& dev_ctx,
   }));
 }
 
+template <typename T, typename IntT>
+void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx,
+                               const SparseCooTensor& x,
+                               const DenseTensor& mask_indices,
+                               DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      mask_indices.dims().size(),
+      2,
+      phi::errors::InvalidArgument("the mask_indices must be 2-D tensor"));
+
+  const int64_t sparse_dim = x.non_zero_indices().dims()[0];
+
+  std::vector<IntT> sparse_offsets(sparse_dim), x_indexs(x.nnz()),
+      mask_indexs(mask_indices.dims()[1]);
+  phi::funcs::sparse::CalcOffsetsPerDim<IntT>(
+      x.dims(), sparse_dim, &sparse_offsets);
+
+  phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data<IntT>(),
+                                     sparse_offsets.data(),
+                                     x.nnz(),
+                                     sparse_dim,
+                                     0,
+                                     1,
+                                     x_indexs.data());
+  phi::funcs::sparse::FlattenIndices(mask_indices.data<IntT>(),
+                                     sparse_offsets.data(),
+                                     x.nnz(),
+                                     sparse_dim,
+                                     0,
+                                     1,
+                                     mask_indexs.data());
+
+  std::unordered_map<IntT, uint64_t> x_indexs_map;
+  for (uint64_t i = 0; i < x_indexs.size(); i++) {
+    x_indexs_map[x_indexs[i]] = i;
+  }
+
+  *out = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
+  T* out_ptr = out->data<T>();
+  memset(out_ptr, static_cast<T>(0), out->numel() * sizeof(T));
+  const int64_t stride =
+      x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim;
+  const T* in_ptr = x.non_zero_elements().data<T>();
+  // TODO(zhangkaihuo): multithreading can be used for acceleration
+  for (uint64_t i = 0; i < mask_indexs.size(); i++) {
+    auto iter = x_indexs_map.find(mask_indexs[i]);
+    if (iter != x_indexs_map.end()) {
+      memcpy(out_ptr + i * stride,
+             in_ptr + iter->second * stride,
+             stride * sizeof(T));
+    }
+  }
+}
+
+/**
+ * @brief filter values from x.values() using mask_indices
+ */
+template <typename T, typename Context>
+void SparseMaskHelperKernel(const Context& dev_ctx,
+                            const SparseCooTensor& x,
+                            const DenseTensor& mask_indices,
+                            DenseTensor* out) {
+  PD_DISPATCH_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] {
+        SparseMaskHelperCPUKernel<T, data_t>(dev_ctx, x, mask_indices, out);
+      }));
+}
+
 }  // namespace sparse
 }  // namespace phi

@@ -101,3 +167,16 @@ PD_REGISTER_KERNEL(sparse_mask,
                    int64_t) {
   kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
+
+PD_REGISTER_KERNEL(sparse_mask_helper,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseMaskHelperKernel,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
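
`SparseMaskHelperCPUKernel` above builds a hash map from x's flattened indices to row positions, then gathers one value row per mask index, leaving zeros where a mask index has no match in x. A rough Python rendering of that gather (illustrative only, not Paddle code):

    import numpy as np

    def sparse_mask_helper_cpu(x_flat_indexs, x_values, mask_flat_indexs):
        # x_values: one row of length `stride` per non-zero entry of x.
        index_map = {idx: row for row, idx in enumerate(x_flat_indexs)}
        out = np.zeros((len(mask_flat_indexs), x_values.shape[1]),
                       dtype=x_values.dtype)
        for i, idx in enumerate(mask_flat_indexs):
            row = index_map.get(idx)
            if row is not None:
                out[i] = x_values[row]
        return out

    x_values = np.array([[1.0], [2.0], [3.0]])
    print(sparse_mask_helper_cpu([0, 5, 7], x_values, [5, 6, 7]))
    # [[2.] [0.] [3.]]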
paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc

@@ -394,3 +394,15 @@ PD_REGISTER_KERNEL(csr_values,
                    int64_t) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
+
+PD_REGISTER_KERNEL(sparse_coo_tensor,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseCooTensorKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu

@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/binary_search.h>
+
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/ddim.h"

@@ -20,6 +22,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
 
 #include "paddle/phi/api/ext/dispatch.h"

@@ -59,7 +62,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx,
   const DenseTensor& indices = mask.non_zero_indices();
   const DenseTensor& values = mask.non_zero_elements();
   int sparse_dim = indices.dims().size();
-  DenseTensor sparse_offsets = phi::Empty(
+  DenseTensor sparse_offsets = phi::Empty<GPUContext>(
       dev_ctx,
       DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW));
   std::vector<int64_t> h_sparse_offsets(sparse_dim);

@@ -121,6 +124,153 @@ void SparseMaskKernel(const Context& dev_ctx,
   }));
 }
 
+// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices
+template <typename IntT>
+__global__ void FlattenIndicesKernel(const IntT* indices,
+                                     const IntT* sparse_offsets,
+                                     const int64_t non_zero_num,
+                                     const int64_t sparse_dim,
+                                     IntT* out) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  phi::funcs::sparse::FlattenIndices<IntT>(indices,
+                                           sparse_offsets,
+                                           non_zero_num,
+                                           sparse_dim,
+                                           tid,
+                                           gridDim.x * blockDim.x,
+                                           out);
+}
+
+template <typename T, typename IntT>
+__global__ void SparseMaskCopyKernel(const IntT* x_indexs,
+                                     const IntT* mask_indexs,
+                                     const IntT* bound_out,
+                                     const T* x_values,
+                                     const int64_t n,
+                                     const int64_t stride,
+                                     T* out_values) {
+  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
+    const IntT j = bound_out[i];
+    if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) {
+      for (int k = 0; k < stride; k++) {
+        out_values[i * stride + k] = x_values[j * stride + k];
+      }
+    }
+  }
+}
+
+template <typename T, typename IntT>
+void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx,
+                               const SparseCooTensor& x,
+                               const DenseTensor& mask_indices,
+                               DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      mask_indices.dims().size(),
+      2,
+      phi::errors::InvalidArgument("the mask_indices must be 2-D tensor"));
+
+  const int64_t sparse_dim = x.non_zero_indices().dims()[0];
+  auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
+
+  std::vector<IntT> sparse_offsets(sparse_dim);
+
+  DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW);
+  DenseTensorMeta mask_indexs_meta(
+      indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW);
+  DenseTensorMeta sparse_offset_meta(
+      indices_dtype, {sparse_dim}, DataLayout::NCHW);
+
+  DenseTensor x_indexs =
+      phi::Empty<GPUContext>(dev_ctx, std::move(x_indexs_meta));
+  DenseTensor mask_indexs =
+      phi::Empty<GPUContext>(dev_ctx, std::move(mask_indexs_meta));
+  DenseTensor bound_out =
+      phi::Empty<GPUContext>(dev_ctx, std::move(mask_indexs_meta));
+  DenseTensor d_sparse_offsets =
+      phi::Empty<GPUContext>(dev_ctx, std::move(sparse_offset_meta));
+  IntT* x_indexs_ptr = x_indexs.data<IntT>();
+  IntT* mask_indexs_ptr = mask_indexs.data<IntT>();
+  IntT* bound_out_ptr = bound_out.data<IntT>();
+
+  // 1. calc the offsets of per dim
+  phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets);
+  // 2. copy sparse_offsets to device
+  phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data<IntT>(),
+                                     sparse_offsets.data(),
+                                     sizeof(IntT) * sparse_dim,
+#ifdef PADDLE_WITH_HIP
+                                     hipMemcpyHostToDevice,
+#else
+                                     cudaMemcpyHostToDevice,
+#endif
+                                     dev_ctx.stream());
+
+  // 3. flatten x indices and mask indices
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1);
+  FlattenIndicesKernel<<<config.block_per_grid,
+                         config.thread_per_block,
+                         0,
+                         dev_ctx.stream()>>>(x.non_zero_indices().data<IntT>(),
+                                             d_sparse_offsets.data<IntT>(),
+                                             x_indexs.numel(),
+                                             sparse_dim,
+                                             x_indexs_ptr);
+
+  config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1);
+  FlattenIndicesKernel<<<config.block_per_grid,
+                         config.thread_per_block,
+                         0,
+                         dev_ctx.stream()>>>(mask_indices.data<IntT>(),
+                                             d_sparse_offsets.data<IntT>(),
+                                             mask_indexs.numel(),
+                                             sparse_dim,
+                                             mask_indexs_ptr);
+
+  // 4. call thrust::lower_bound
+#ifdef PADDLE_WITH_HIP
+  thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                      x_indexs_ptr,
+                      x_indexs_ptr + x_indexs.numel(),
+                      mask_indexs_ptr,
+                      mask_indexs_ptr + mask_indexs.numel(),
+                      bound_out_ptr);
+
+  // 5. copy value to out
+  *out = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
+  phi::funcs::SetConstant<GPUContext, T> set_zero;
+  set_zero(dev_ctx, out, static_cast<T>(0));
+  T* out_ptr = out->data<T>();
+
+  const int64_t stride =
+      x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim;
+
+  SparseMaskCopyKernel<<<config.block_per_grid,
+                         config.thread_per_block,
+                         0,
+                         dev_ctx.stream()>>>(x_indexs_ptr,
+                                             mask_indexs_ptr,
+                                             bound_out_ptr,
+                                             x.non_zero_elements().data<T>(),
+                                             mask_indexs.numel(),
+                                             stride,
+                                             out_ptr);
+}
+
+template <typename T, typename Context>
+void SparseMaskHelperKernel(const Context& dev_ctx,
+                            const SparseCooTensor& x,
+                            const DenseTensor& mask_indices,
+                            DenseTensor* out) {
+  PD_DISPATCH_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] {
+        SparseMaskHelperGPUKernel<T, data_t>(dev_ctx, x, mask_indices, out);
+      }));
+}
+
 }  // namespace sparse
 }  // namespace phi

@@ -138,3 +288,17 @@ PD_REGISTER_KERNEL(sparse_mask,
                    int64_t) {
   kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
+
+PD_REGISTER_KERNEL(sparse_mask_helper,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseMaskHelperKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
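
On GPU the matching step uses `thrust::lower_bound` instead of a hash map: for each flattened mask index it binary-searches x's flattened indices (this path assumes they are already in sorted order) and copies the value row only when the found position holds exactly that index. A bisect-based sketch of the same idea (illustrative only, not Paddle code):

    from bisect import bisect_left

    def sparse_mask_helper_gpu_like(x_flat_indexs, x_values, mask_flat_indexs):
        # bound[i] = first j with x_flat_indexs[j] >= mask_flat_indexs[i]
        out = [[0.0] * len(x_values[0]) for _ in mask_flat_indexs]
        for i, idx in enumerate(mask_flat_indexs):
            j = bisect_left(x_flat_indexs, idx)
            if j < len(x_flat_indexs) and x_flat_indexs[j] == idx:
                out[i] = x_values[j]
        return out

    print(sparse_mask_helper_gpu_like([0, 5, 7],
                                      [[1.0], [2.0], [3.0]],
                                      [5, 6, 7]))
    # [[2.0], [0.0], [3.0]]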
paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu

@@ -665,3 +665,15 @@ PD_REGISTER_KERNEL(csr_values,
                    int64_t) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
+
+PD_REGISTER_KERNEL(sparse_coo_tensor,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseCooTensorKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
paddle/phi/kernels/sparse/sparse_mask_kernel.h

@@ -26,5 +26,11 @@ void SparseMaskKernel(const Context& dev_ctx,
                       const SparseCooTensor& mask,
                       SparseCooTensor* out);
 
+template <typename T, typename Context>
+void SparseMaskHelperKernel(const Context& dev_ctx,
+                            const SparseCooTensor& x,
+                            const DenseTensor& mask_indices,
+                            DenseTensor* out);
+
 }  // namespace sparse
 }  // namespace phi
paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc

@@ -66,6 +66,19 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad,
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
 
+PD_REGISTER_KERNEL(sparse_coo_tensor_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseCooTensorGradKernel,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_KERNEL(coo_values_grad,
                    GPU,

@@ -95,4 +108,16 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad,
                    int64_t) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
+
+PD_REGISTER_KERNEL(sparse_coo_tensor_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::SparseCooTensorGradKernel,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
 #endif
paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h

@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
 
 namespace phi {
 namespace sparse {

@@ -32,5 +33,13 @@ void SparseCooToDenseGradKernel(const Context& dev_ctx,
                                 const DenseTensor& out_grad,
                                 SparseCooTensor* x_grad);
 
+template <typename T, typename Context>
+void SparseCooTensorGradKernel(const Context& dev_ctx,
+                               const DenseTensor& indices,
+                               const SparseCooTensor& out_grad,
+                               DenseTensor* values_grad) {
+  SparseMaskHelperKernel<T, Context>(dev_ctx, out_grad, indices, values_grad);
+}
+
 }  // namespace sparse
 }  // namespace phi
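
`SparseCooTensorGradKernel` above reduces the backward pass to a masked gather: the gradient of `values` is `out_grad`'s values read at the forward `indices`, and zero wherever a forward coordinate is absent from `out_grad`'s sparsity pattern. A compact Python restatement on flattened indices (illustrative only; the helper name is hypothetical):

    def sparse_coo_tensor_grad(forward_flat_indexs, out_grad_flat_indexs,
                               out_grad_values):
        index_map = {idx: row for row, idx in enumerate(out_grad_flat_indexs)}
        return [out_grad_values[index_map[idx]] if idx in index_map else 0.0
                for idx in forward_flat_indexs]

    # Mirrors the new unit test on a 2x2 tensor: forward indices (0,0),(1,1)
    # flatten to [0, 3]; out_grad has (0,1),(1,1) -> [1, 3] with values [2, 3].
    print(sparse_coo_tensor_grad([0, 3], [1, 3], [2.0, 3.0]))   # [0.0, 3.0]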
paddle/phi/kernels/sparse/sparse_utils_kernel.h

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/common/int_array.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"

@@ -147,5 +148,16 @@ void CsrValuesKernel(const Context& dev_ctx,
   *out = x.non_zero_elements();
 }
 
+template <typename T, typename Context>
+void SparseCooTensorKernel(const Context& dev_ctx,
+                           const DenseTensor& values,
+                           const DenseTensor& indices,
+                           const IntArray& dense_shape,
+                           SparseCooTensor* out) {
+  *out =
+      SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData()));
+  // TODO(zhangkaihuo): sort and merge the dumplicate indices
+}
+
 }  // namespace sparse
 }  // namespace phi
python/paddle/fluid/tests/unittests/test_sparse_utils_op.py

@@ -134,9 +134,11 @@ class TestSparseConvert(unittest.TestCase):
             #test to_sparse_coo_grad backward
             out_grad_indices = [[0, 1], [0, 1]]
             out_grad_values = [2.0, 3.0]
-            out_grad = core.eager.sparse_coo_tensor(
-                paddle.to_tensor(out_grad_indices),
-                paddle.to_tensor(out_grad_values), out.shape, True)
+            out_grad = paddle.sparse.sparse_coo_tensor(
+                paddle.to_tensor(out_grad_indices),
+                paddle.to_tensor(out_grad_values),
+                shape=out.shape,
+                stop_gradient=True)
             out.backward(out_grad)
             assert np.array_equal(dense_x.grad.numpy(),
                                   out_grad.to_dense().numpy())

@@ -145,9 +147,11 @@ class TestSparseConvert(unittest.TestCase):
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = core.eager.sparse_coo_tensor(
-                paddle.to_tensor(indices),
-                paddle.to_tensor(values), [3, 4], False)
+            sparse_x = paddle.sparse.sparse_coo_tensor(
+                paddle.to_tensor(indices),
+                paddle.to_tensor(values),
+                shape=[3, 4],
+                stop_gradient=False)
             dense_tensor = sparse_x.to_dense()
             #test to_dense_grad backward
             out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],

@@ -158,6 +162,17 @@ class TestSparseConvert(unittest.TestCase):
             assert np.array_equal(correct_x_grad,
                                   sparse_x.grad.values().numpy())
 
+            paddle.device.set_device("cpu")
+            sparse_x_cpu = paddle.sparse.sparse_coo_tensor(
+                paddle.to_tensor(indices),
+                paddle.to_tensor(values),
+                shape=[3, 4],
+                stop_gradient=False)
+            dense_tensor_cpu = sparse_x_cpu.to_dense()
+            dense_tensor_cpu.backward(paddle.to_tensor(out_grad))
+            assert np.array_equal(correct_x_grad,
+                                  sparse_x_cpu.grad.values().numpy())
+
     def test_to_sparse_csr(self):
         with _test_eager_guard():
             x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]

@@ -177,15 +192,52 @@ class TestSparseConvert(unittest.TestCase):
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = core.eager.sparse_coo_tensor(
-                paddle.to_tensor(indices),
-                paddle.to_tensor(values), [3, 4], False)
+            sparse_x = paddle.sparse.sparse_coo_tensor(
+                paddle.to_tensor(indices),
+                paddle.to_tensor(values),
+                shape=[3, 4],
+                stop_gradient=False)
             values_tensor = sparse_x.values()
             out_grad = [2.0, 3.0, 5.0, 8.0, 9.0]
             # test coo_values_grad
             values_tensor.backward(paddle.to_tensor(out_grad))
             assert np.array_equal(out_grad, sparse_x.grad.values().numpy())
 
+    def test_sparse_coo_tensor_grad(self):
+        with _test_eager_guard():
+            indices = [[0, 1], [0, 1]]
+            values = [1, 2]
+            indices = paddle.to_tensor(indices, dtype='int32')
+            values = paddle.to_tensor(
+                values, dtype='float32', stop_gradient=False)
+            sparse_x = paddle.sparse.sparse_coo_tensor(
+                indices, values, shape=[2, 2], stop_gradient=False)
+            grad_indices = [[0, 1], [1, 1]]
+            grad_values = [2, 3]
+            grad_indices = paddle.to_tensor(grad_indices, dtype='int32')
+            grad_values = paddle.to_tensor(grad_values, dtype='float32')
+            sparse_out_grad = paddle.sparse.sparse_coo_tensor(
+                grad_indices, grad_values, shape=[2, 2])
+            sparse_x.backward(sparse_out_grad)
+            correct_values_grad = [0, 3]
+            assert np.array_equal(correct_values_grad, values.grad.numpy())
+
+            place = core.CPUPlace()
+            indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place)
+            values_cpu = paddle.to_tensor(
+                values, dtype='float32', place=place, stop_gradient=False)
+            sparse_x_cpu = paddle.sparse.sparse_coo_tensor(
+                indices_cpu,
+                values_cpu,
+                shape=[2, 2],
+                place=place,
+                stop_gradient=False)
+
+            sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor(
+                grad_indices, grad_values, shape=[2, 2], place=place)
+            sparse_x_cpu.backward(sparse_out_grad_cpu)
+            assert np.array_equal(correct_values_grad, values_cpu.grad.numpy())
+
 
 if __name__ == "__main__":
     unittest.main()
python/paddle/sparse/creation.py

@@ -14,6 +14,7 @@
 
 from paddle import _C_ops
 from ..framework import core, dygraph_only
+from ..framework import _current_expected_place, _get_paddle_place
 from ..tensor import to_tensor
 from ..tensor import max
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype

@@ -38,6 +39,18 @@ def _infer_dense_shape(indices):
     return list(lens.numpy())
 
 
+def _get_place(place):
+    place = _get_paddle_place(place)
+    if place is None:
+        place = _current_expected_place()
+    elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
+                                core.CUDAPlace)):
+        raise ValueError(
+            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
+        )
+    return place
+
+
 @dygraph_only
 def sparse_coo_tensor(indices,
                       values,

@@ -94,6 +107,8 @@ def sparse_coo_tensor(indices,
             #        values=[1., 2., 3.])
     """
 
+    place = _get_place(place)
+
     if not isinstance(indices, core.eager.Tensor):
         indices = to_tensor(
             indices, dtype=None, place=place, stop_gradient=True)

@@ -101,13 +116,20 @@ def sparse_coo_tensor(indices,
         values = to_tensor(values, dtype, place, stop_gradient)
     if len(indices.shape) != 2:
         raise ValueError("'indices' must be 2-D.")
 
+    if place is not None:
+        if not indices.place._equals(place):
+            indices = indices._copy_to(place, False)
+        if not values.place._equals(place):
+            values = values._copy_to(place, False)
+
     values = _handle_dtype(values, dtype)
     values.stop_gradient = stop_gradient
+
     if shape is None:
         shape = _infer_dense_shape(indices)
-    return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient)
+
+    return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices,
+                                                              shape)
 
 
 #TODO: need to support shape is None

@@ -171,6 +193,9 @@ def sparse_csr_tensor(crows,
             #        cols=[1, 3, 2, 0, 1],
             #        values=[1, 2, 3, 4, 5])
     """
+
+    place = _get_place(place)
+
     if not isinstance(crows, core.eager.Tensor):
         crows = to_tensor(crows, dtype=None, place=place, stop_gradient=True)
     if not isinstance(cols, core.eager.Tensor):

@@ -182,10 +207,15 @@ def sparse_csr_tensor(crows,
             "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D."
         )
 
+    if place is not None:
+        if not crows.place._equals(place):
+            crows = crows._copy_to(place, False)
+        if not cols.place._equals(place):
+            cols = cols._copy_to(place, False)
+        if not values.place._equals(place):
+            values = values._copy_to(place, False)
+
     values = _handle_dtype(values, dtype)
     values.stop_gradient = stop_gradient
     return core.eager.sparse_csr_tensor(crows, cols, values, shape,
                                         stop_gradient)
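
With `_get_place` in place, the Python constructors accept an explicit `place` and move `indices`/`values` there before building the sparse tensor. A brief usage sketch mirroring the CPU path exercised by the new test (illustrative only):

    import paddle
    from paddle.fluid import core

    place = core.CPUPlace()
    indices = paddle.to_tensor([[0, 1], [0, 1]], dtype='int32', place=place)
    values = paddle.to_tensor([1.0, 2.0], dtype='float32', place=place)
    coo = paddle.sparse.sparse_coo_tensor(
        indices, values, shape=[2, 2], place=place)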
python/paddle/utils/code_gen/sparse_api.yaml

@@ -21,6 +21,14 @@
     layout : x
   backward : coo_values_grad
 
+- api : create_sparse_coo_tensor
+  args : (Tensor values, Tensor indices, IntArray dense_shape)
+  output : Tensor(out@SparseCooTensor)
+  kernel :
+    func : sparse_coo_tensor
+    layout : values
+  backward : create_sparse_coo_tensor_grad
+
 - api : csr_values
   args : (Tensor x)
   output : Tensor(out@DenseTensor)
python/paddle/utils/code_gen/sparse_bw_api.yaml

@@ -19,6 +19,13 @@
   kernel :
     func : coo_values_grad
 
+- backward_api : create_sparse_coo_tensor_grad
+  forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out@SparseCooTensor)
+  args : (Tensor indices, Tensor out_grad)
+  output : Tensor(values_grad@DenseTensor)
+  kernel :
+    func : sparse_coo_tensor_grad
+
 - backward_api : dense_to_coo_grad
   forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out@SparseCooTensor)
   args : (Tensor out_grad)