Unverified · Commit 15251291 · Authored by Zhan Rongrui · Committed by GitHub

[Sparse] Fix bugs in paddle.sparse.transpose and paddle.sparse.reshape (#53038)

Parent 5cfaa7dd
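Editor's note: the commit addresses two independent problems visible in the hunks below. In the CPU CSR transpose kernel, several loop and scratch variables were declared `int` while the crows/cols buffers hold `int64_t` indices, so values read from `x_cols_data` and `x_crows_data` could be silently narrowed (see the sketch after the first file's hunks). In the GPU reshape and transpose kernels, device scratch buffers were allocated with raw `cudaMalloc`/`hipMalloc` and staged with `cudaMemcpy`/`hipMemcpy`, with no matching free in the code shown; the rewrite routes both platforms through `memory_utils::Alloc` and `memory_utils::Copy`, which return owning allocation handles and issue the copies on the device context's stream.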
@@ -33,7 +33,7 @@ void TransposeCooKernel(const Context& dev_ctx,
   int64_t x_nnz = x.nnz();
   DDim out_dims = x.dims().transpose(perm);
   DenseTensor out_indices = EmptyLike<int64_t, Context>(dev_ctx, x.indices());
-  DenseTensor out_values(x.values());
+  const DenseTensor& out_values(x.values());
   out->SetMember(out_indices, out_values, out_dims, x.coalesced());
   // compute values of indices
@@ -108,7 +108,7 @@ void TransposeCsrKernel(const Context& dev_ctx,
     out_crows_data[i] = 0;
   }
   for (int i = 0; i < x_nnz; ++i) {
-    int j = x_cols_data[i];
+    int64_t j = x_cols_data[i];
     out_crows_data[j + 1]++;
   }
   out_crows_data[out_dims[0]] = x_nnz;
@@ -134,8 +134,8 @@ void TransposeCsrKernel(const Context& dev_ctx,
       }
     }
   } else {  // n_dim == 3
-    int out_n_rows = out_dims[1];
-    int x_n_rows = x.dims()[1];
+    int64_t out_n_rows = out_dims[1];
+    int64_t x_n_rows = x.dims()[1];
     for (int k = 0; k < out_dims[0]; ++k) {
       if (perm[0] == 0) {  // perm == {0, 2, 1}
         // compute out_crows_data by x_cols_data
@@ -143,7 +143,7 @@ void TransposeCsrKernel(const Context& dev_ctx,
           out_crows_data[i] = 0;
         }
         for (int i = 0; i < x_crows_data[x_n_rows]; ++i) {
-          int j = x_cols_data[i];
+          int64_t j = x_cols_data[i];
           out_crows_data[j + 1]++;
         }
         out_crows_data[out_n_rows] = x_crows_data[x_n_rows];
@@ -176,14 +176,14 @@ void TransposeCsrKernel(const Context& dev_ctx,
         for (int i = 0; i < out_n_rows; ++i) {
           out_crows_data[i] = 0;
         }
-        int x_cols_offset = 0;
+        int64_t x_cols_offset = 0;
         int out_cols_index = 0;
         for (int i = 0; i < x.dims()[0]; ++i) {
           int x_crows_index = i * (x_n_rows + 1);
-          int start = x_crows_data[x_crows_index + k];
-          int end = x_crows_data[x_crows_index + 1 + k];
+          int64_t start = x_crows_data[x_crows_index + k];
+          int64_t end = x_crows_data[x_crows_index + 1 + k];
           out_crows_data[i + 1] = end - start;
-          for (int j = start; j < end; ++j) {
+          for (int64_t j = start; j < end; ++j) {
             out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j];
             out_values_data[out_cols_index] = x_values_data[x_cols_offset + j];
             out_cols_index++;
......
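A minimal, self-contained sketch (not Paddle code) of the narrowing hazard the `int` → `int64_t` changes above guard against: assigning a 64-bit column index to a plain `int` truncates once the value no longer fits in 32 bits, and the truncated value is then used to index `out_crows_data`.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical value standing in for an entry of x_cols_data, which is
  // an int64_t buffer in the kernel above.
  int64_t col = 3000000000LL;  // exceeds the 32-bit int range

  int narrowed = static_cast<int>(col);  // old code: silently truncates
  int64_t widened = col;                 // fixed code: preserves the index

  std::cout << "narrowed: " << narrowed << ", widened: " << widened << "\n";
  // narrowed wraps to a negative value; using it as j in
  // out_crows_data[j + 1]++ would index far out of bounds.
  return 0;
}
```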
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/sparse/unary_kernel.h"

 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
@@ -78,33 +79,31 @@ void ReshapeCooKernel(const Context& dev_ctx,
   int64_t *destination_x_sparse_part_strides,
       *destination_out_sparse_part_strides;
-#ifdef PADDLE_WITH_HIP
-  hipMalloc(reinterpret_cast<void**>(&destination_x_sparse_part_strides),
-            sizeof(int64_t) * x_sparse_part_strides.size());
-  hipMemcpy(destination_x_sparse_part_strides,
-            x_sparse_part_strides.Get(),
-            sizeof(int64_t) * x_sparse_part_strides.size(),
-            hipMemcpyHostToDevice);
-  hipMalloc(reinterpret_cast<void**>(&destination_out_sparse_part_strides),
-            sizeof(int64_t) * out_sparse_part_strides.size());
-  hipMemcpy(destination_out_sparse_part_strides,
-            out_sparse_part_strides.Get(),
-            sizeof(int64_t) * out_sparse_part_strides.size(),
-            hipMemcpyHostToDevice);
-#else
-  cudaMalloc(reinterpret_cast<void**>(&destination_x_sparse_part_strides),
-             sizeof(int64_t) * x_sparse_part_strides.size());
-  cudaMemcpy(destination_x_sparse_part_strides,
-             x_sparse_part_strides.Get(),
-             sizeof(int64_t) * x_sparse_part_strides.size(),
-             cudaMemcpyHostToDevice);
-  cudaMalloc(reinterpret_cast<void**>(&destination_out_sparse_part_strides),
-             sizeof(int64_t) * out_sparse_part_strides.size());
-  cudaMemcpy(destination_out_sparse_part_strides,
-             out_sparse_part_strides.Get(),
-             sizeof(int64_t) * out_sparse_part_strides.size(),
-             cudaMemcpyHostToDevice);
-#endif
+  auto destination_x_sparse_part_strides_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int64_t) * x_sparse_part_strides.size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  destination_x_sparse_part_strides = reinterpret_cast<int64_t*>(
+      destination_x_sparse_part_strides_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     reinterpret_cast<void*>(destination_x_sparse_part_strides),
+                     phi::CPUPlace(),
+                     x_sparse_part_strides.Get(),
+                     sizeof(int64_t) * x_sparse_part_strides.size(),
+                     dev_ctx.stream());
+
+  auto destination_out_sparse_part_strides_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int64_t) * out_sparse_part_strides.size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  destination_out_sparse_part_strides = reinterpret_cast<int64_t*>(
+      destination_out_sparse_part_strides_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     destination_out_sparse_part_strides,
+                     phi::CPUPlace(),
+                     out_sparse_part_strides.Get(),
+                     sizeof(int64_t) * out_sparse_part_strides.size(),
+                     dev_ctx.stream());
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_nnz, 1);
   ReshapeCooCudaKernel<<<config.block_per_grid.x,
......
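Why the switch away from raw `cudaMalloc`/`hipMalloc` matters: the removed code above never releases the scratch buffers, while `memory_utils::Alloc` returns an owning handle whose allocation is reclaimed when the handle goes out of scope. A generic RAII analogy in plain CUDA C++ (not Paddle's API, just the ownership idea):

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <memory>

// Deleter that returns the device buffer to the CUDA runtime.
struct CudaDeleter {
  void operator()(void* p) const { cudaFree(p); }
};

using DeviceBuffer = std::unique_ptr<void, CudaDeleter>;

// Allocate device memory wrapped in an owning handle, so the buffer is
// released automatically at scope exit -- no leak if the caller forgets
// (or an early return skips) an explicit cudaFree.
DeviceBuffer DeviceAlloc(std::size_t bytes) {
  void* p = nullptr;
  cudaMalloc(&p, bytes);
  return DeviceBuffer(p);
}
```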
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/sparse/unary_kernel.h"

 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
@@ -170,15 +171,18 @@ void TransposeCooKernel(const Context &dev_ctx,
   const auto *x_indices_data = x_indices.data<int64_t>();
   auto *out_indices_data = out_indices.data<int64_t>();
   int *d_perm;
-#ifdef PADDLE_WITH_HIP
-  hipMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
-  hipMemcpy(
-      d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice);
-#else
-  cudaMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
-  cudaMemcpy(
-      d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice);
-#endif
+
+  auto d_perm_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int) * perm.size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  d_perm = reinterpret_cast<int *>(d_perm_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     d_perm,
+                     phi::CPUPlace(),
+                     perm.data(),
+                     sizeof(int) * perm.size(),
+                     dev_ctx.stream());
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_nnz * n_dim, 1);
   TransposeCooCudaKernel<<<config.block_per_grid.x,
@@ -242,39 +246,41 @@ void TransposeCsrKernel(const Context &dev_ctx,
   const T *x_values_data = x_values.data<T>();
   int *d_perm;
   int64_t *d_x_dims, *d_out_dims;
-#ifdef PADDLE_WITH_HIP
-  hipMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
-  hipMemcpy(
-      d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice);
-  hipMalloc(reinterpret_cast<void **>(&d_x_dims),
-            sizeof(int64_t) * x.dims().size());
-  hipMemcpy(d_x_dims,
-            x.dims().Get(),
-            sizeof(int64_t) * x.dims().size(),
-            hipMemcpyHostToDevice);
-  hipMalloc(reinterpret_cast<void **>(&d_out_dims),
-            sizeof(int64_t) * out_dims.size());
-  hipMemcpy(d_out_dims,
-            out_dims.Get(),
-            sizeof(int64_t) * out_dims.size(),
-            hipMemcpyHostToDevice);
-#else
-  cudaMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
-  cudaMemcpy(
-      d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice);
-  cudaMalloc(reinterpret_cast<void **>(&d_x_dims),
-             sizeof(int64_t) * x.dims().size());
-  cudaMemcpy(d_x_dims,
-             x.dims().Get(),
-             sizeof(int64_t) * x.dims().size(),
-             cudaMemcpyHostToDevice);
-  cudaMalloc(reinterpret_cast<void **>(&d_out_dims),
-             sizeof(int64_t) * out_dims.size());
-  cudaMemcpy(d_out_dims,
-             out_dims.Get(),
-             sizeof(int64_t) * out_dims.size(),
-             cudaMemcpyHostToDevice);
-#endif
+
+  auto d_perm_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int) * perm.size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  d_perm = reinterpret_cast<int *>(d_perm_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     d_perm,
+                     phi::CPUPlace(),
+                     perm.data(),
+                     sizeof(int) * perm.size(),
+                     dev_ctx.stream());
+  auto d_x_dims_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int64_t) * x.dims().size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  d_x_dims = reinterpret_cast<int64_t *>(d_x_dims_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     d_x_dims,
+                     phi::CPUPlace(),
+                     x.dims().Get(),
+                     sizeof(int64_t) * x.dims().size(),
+                     dev_ctx.stream());
+  auto d_out_dims_tensor = memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int64_t) * out_dims.size(),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  d_out_dims = reinterpret_cast<int64_t *>(d_out_dims_tensor->ptr());
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     d_out_dims,
+                     phi::CPUPlace(),
+                     out_dims.Get(),
+                     sizeof(int64_t) * out_dims.size(),
+                     dev_ctx.stream());
   int64_t x_nnz = x.nnz();
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_dims[0], 1);
......
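A note on the design choice: because `memory_utils::Copy` is handed `dev_ctx.stream()`, the host-to-device staging copies are enqueued on the same stream as the kernel launch that follows (collapsed in the diff above), so the kernel cannot read `d_perm`, `d_x_dims`, or `d_out_dims` before the copies complete, and no explicit device synchronization is needed.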