Unverified commit 9841b308, authored by zhangkaihuo, committed by GitHub

Optimize sparse convolution (#43576)

Parent 22342d51
@@ -80,14 +80,14 @@
data_type : x
backward : cast_grad
-- api : conv3d
+- api : conv3d_coo
-args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
+args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
-output : Tensor(out), Tensor(rulebook)
+output : Tensor(out), Tensor(rulebook), Tensor(counter)
kernel :
-func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense}
+func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense, dense}
layout : x
-intermediate : rulebook
+intermediate : rulebook, counter
-backward : conv3d_grad
+backward : conv3d_coo_grad
- api : coo_to_dense
args : (Tensor x)
@@ -352,11 +352,11 @@
- api: maxpool
args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides)
-output : Tensor(out), Tensor(rulebook)
+output : Tensor(out), Tensor(rulebook), Tensor(counter)
kernel :
-func : maxpool_coo{sparse_coo -> sparse_coo, dense}
+func : maxpool_coo{sparse_coo -> sparse_coo, dense, dense}
layout : x
-intermediate : rulebook
+intermediate : rulebook, counter
backward : maxpool_grad
- api: mv
...
@@ -81,12 +81,12 @@
cast_csr_grad {sparse_csr, sparse_csr -> sparse_csr}
data_type : out_grad
-- backward_api : conv3d_grad
+- backward_api : conv3d_coo_grad
-forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
+forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter)
-args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
+args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
output : Tensor(x_grad), Tensor(kernel_grad)
kernel :
-func : conv3d_coo_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
+func : conv3d_coo_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
- backward_api : coo_to_dense_grad
forward : coo_to_dense(Tensor x) -> Tensor(out)
@@ -164,11 +164,11 @@
matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}
- backward_api : maxpool_grad
-forward : maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook)
+forward : maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter)
-args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes)
+args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes)
output : Tensor(x_grad)
kernel :
-func : maxpool_coo_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo}
+func : maxpool_coo_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo}
- backward_api : multiply_grad
forward : multiply(Tensor x, Tensor y) -> Tensor(out)
...
@@ -156,6 +156,48 @@ class SparseCooTensor : public TensorBase,
/// \brief get the dense dim
int32_t dense_dim() const;
/// \brief query table according to key
const std::pair<DenseTensor, DenseTensor>* IndicesPairs(
const std::string& key) const {
if (indices_dict_ == nullptr) {
return nullptr;
}
const auto& iter = indices_dict_->find(key);
if (iter == indices_dict_->end()) {
return nullptr;
}
return &iter->second;
}
/// \brief save (key, indices_pairs)
void SaveIndicesPairs(
const std::string& key,
const std::pair<DenseTensor, DenseTensor>& indices_pairs) {
if (indices_dict_ == nullptr) {
indices_dict_ = std::make_shared<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>();
}
auto ret = indices_dict_->insert({key, indices_pairs});
if (ret.second == false) {
ret.first->second = indices_pairs;
}
}
/// \brief get indices_dict_
const std::shared_ptr<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>&
GetIndicesDict() const {
return indices_dict_;
}
/// \brief set indices_dict_
void SetIndicesDict(
const std::shared_ptr<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>&
indices_dict) {
indices_dict_ = indices_dict;
}
private:
// save the indices of non zero elements in original dense tensor
DenseTensor non_zero_indices_;
@@ -165,6 +207,14 @@ class SparseCooTensor : public TensorBase,
bool coalesced_ = false;
// save the number of non zero elements in each batch
DDim dims_;
// for submanifold conv
// SubmConv will generate a rulebook and a counter, which can be
// reused by different SubmConv.
// refer to sparse/gpu/convolution_kernel.cu.
std::shared_ptr<std::map<std::string, std::pair<DenseTensor, DenseTensor>>>
indices_dict_ = nullptr;
/* --------------------------- */
/* example: non zero element is scalar */
/* --------------------------- */
...
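A minimal standalone sketch (not part of this commit) of the key -> (rulebook, counter) cache behavior that IndicesPairs/SaveIndicesPairs implement above. FakeTensor and FakeSparseCooTensor are stand-ins for DenseTensor and SparseCooTensor; the lookup, lazy-creation and overwrite-on-same-key semantics mirror the methods shown.

#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Stand-ins for the DenseTensor pair holding a rulebook and a counter.
using FakeTensor = std::vector<int>;
using IndicesPair = std::pair<FakeTensor, FakeTensor>;
using IndicesDict = std::map<std::string, IndicesPair>;

struct FakeSparseCooTensor {
  std::shared_ptr<IndicesDict> dict;

  // Mirrors SparseCooTensor::IndicesPairs: nullptr when the key is absent.
  const IndicesPair* Find(const std::string& key) const {
    if (!dict) return nullptr;
    auto iter = dict->find(key);
    return iter == dict->end() ? nullptr : &iter->second;
  }

  // Mirrors SaveIndicesPairs: lazily create the dict, overwrite on a duplicate key.
  void Save(const std::string& key, const IndicesPair& pair) {
    if (!dict) dict = std::make_shared<IndicesDict>();
    auto ret = dict->insert({key, pair});
    if (!ret.second) ret.first->second = pair;
  }
};

int main() {
  FakeSparseCooTensor x;
  x.Save("subm_conv1", {{0, 1, 2}, {3}});  // first SubmConv saves its rulebook/counter
  FakeSparseCooTensor out;
  out.dict = x.dict;  // like out->SetIndicesDict(x.GetIndicesDict()): the cache travels with the tensor
  return out.Find("subm_conv1") != nullptr ? 0 : 1;
}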
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
@@ -188,6 +189,88 @@ inline void PrefixSum(const T* counter, T* offsets, const int n) {
offsets[n] = offset;
}
template <typename IntT>
inline const IntT* GetRulebookPtr(const SparseCooTensor& coo,
const DenseTensor& rulebook,
const std::string& key,
int* rulebook_len) {
if (!key.empty()) {
const auto* indices_pairs = coo.IndicesPairs(key);
if (indices_pairs != nullptr) {
const DenseTensor& tmp_rulebook = indices_pairs->first;
*rulebook_len = tmp_rulebook.dims()[1];
return tmp_rulebook.data<IntT>();
}
}
*rulebook_len = rulebook.dims()[1];
return rulebook.data<IntT>();
}
inline const int* GetCounterPtr(const SparseCooTensor& coo,
const DenseTensor& counter,
const std::string& key) {
if (!key.empty()) {
const auto* indices_pairs = coo.IndicesPairs(key);
if (indices_pairs != nullptr) {
return indices_pairs->second.data<int>();
}
}
return counter.data<int>();
}
template <typename T, typename IntT, typename Context>
inline const IntT* PrepareSubm(const Context& dev_ctx,
const SparseCooTensor& x,
const std::string& key,
const DDim& out_dims,
SparseCooTensor* out,
int* counter,
int* offsets,
int* rulebook_len,
bool* need_product_rulebook) {
const auto* indices_pairs = x.IndicesPairs(key);
if (indices_pairs != nullptr) {
*need_product_rulebook = false;
const DenseTensor& rulebook = indices_pairs->first;
const int counter_size = indices_pairs->second.numel();
memcpy(
counter, indices_pairs->second.data<int>(), counter_size * sizeof(int));
out->SetIndicesDict(x.GetIndicesDict());
*rulebook_len = rulebook.dims()[1];
DenseTensor out_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor out_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
phi::Copy(
dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices);
out->SetMember(out_indices, out_values, out_dims, false);
PrefixSum<int>(counter, offsets, counter_size);
return rulebook.data<IntT>();
}
return nullptr;
}
template <typename Context>
inline void SaveToTable(const Context& dev_ctx,
const SparseCooTensor& x,
const std::string& key,
const DenseTensor& in_rulebook,
const DenseTensor& h_counter,
SparseCooTensor* out,
DenseTensor* out_rulebook,
DenseTensor* counter) {
out->SetIndicesDict(x.GetIndicesDict());
if (!key.empty()) {
out->SaveIndicesPairs(key, std::make_pair(in_rulebook, h_counter));
} else {
*out_rulebook = in_rulebook;
counter->Resize({h_counter.numel()});
int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
memcpy(counter_ptr, h_counter.data<int>(), h_counter.numel() * sizeof(int));
}
}
} // namespace sparse
} // namespace funcs
} // namespace phi
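The helpers above implement a simple reuse protocol: when a key is given and the input already carries a cached (rulebook, counter) pair, the kernel skips rulebook construction (PrepareSubm); otherwise it builds one and either caches it under the key or returns it through the rulebook/counter outputs (SaveToTable). A hedged, simplified sketch of that control flow; the types and the GetOrBuildRulebook helper are stand-ins, not Paddle APIs.

#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using Rulebook = std::vector<int>;
using Counter = std::vector<int>;
using Cache = std::map<std::string, std::pair<Rulebook, Counter>>;

// Simplified analogue of the PrepareSubm + SaveToTable flow in a conv kernel.
std::pair<Rulebook, Counter> GetOrBuildRulebook(
    const std::string& key,
    std::shared_ptr<Cache>* cache,
    const std::function<std::pair<Rulebook, Counter>()>& build) {
  if (!key.empty() && *cache) {
    auto it = (*cache)->find(key);
    if (it != (*cache)->end()) {
      return it->second;  // reuse path: no rulebook product needed
    }
  }
  auto pair = build();  // fall back to building the rulebook
  if (!key.empty()) {
    if (!*cache) *cache = std::make_shared<Cache>();
    (**cache)[key] = pair;  // SaveToTable path: cache it for later layers with the same key
  }
  return pair;
}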
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#define VecBytes 16
namespace phi {
namespace funcs {
@@ -28,33 +33,126 @@ namespace sparse {
* channels: the output channel size
* out: the outputs
**/
-template <typename T>
+template <typename T, int VecSize>
__global__ void ScatterKernel(const T* input,
const int* unique_value,
const int* out_index,
const int non_zero_num,
const int rulebook_len,
const int channels,
-T* out,
-const bool subm = false) {
+T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
-for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
-int indices_i = i / channels;
-int channels_i = i - indices_i * channels;
+const int vec_channels = channels / VecSize;
+using LoadT = phi::AlignedVector<T, VecSize>;
+using StoreT = phi::AlignedVector<T, VecSize>;
for (int i = tid; i < non_zero_num * vec_channels;
i += gridDim.x * blockDim.x) {
int indices_i = i / vec_channels;
int channels_i = i - indices_i * vec_channels;
int start = unique_value[indices_i];
int end = indices_i == non_zero_num - 1 ? rulebook_len
: unique_value[indices_i + 1];
// max(end-start) = kernel_size
-T sum = static_cast<T>(0);
-if (subm) {
-sum = out[indices_i * channels + channels_i];
-}
+StoreT sums = {static_cast<T>(0)};
for (int j = start; j < end; j++) {
const int out_feature_i = out_index[j];
-sum += input[out_feature_i * channels + channels_i];
+LoadT vec_in;
phi::Load<T, VecSize>(
input + out_feature_i * channels + channels_i * VecSize, &vec_in);
#pragma unroll
for (int k = 0; k < VecSize; k++) {
sums[k] += vec_in[k];
}
}
phi::Store<T, VecSize>(sums,
out + indices_i * channels + channels_i * VecSize);
}
}
// scatter's index has been grouped in advance
// index_counts record the count of each group
// index_groups save the index of each group
template <typename T, int VecSize>
__global__ void ScatterKernelV2(const T* input,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_counts,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
for (int i = tid; i < non_zero_num * vec_channels;
i += gridDim.x * blockDim.x) {
int indices_i = i / vec_channels;
int channels_i = i - indices_i * vec_channels;
StoreT sums = {static_cast<T>(0)};
phi::Load<T, VecSize>(out + indices_i * channels + channels_i * VecSize,
&sums);
for (int it = 0; it < buffer_counts; it++) {
int len = index_counts[indices_i + it * non_zero_num];
const int group_offset = it * kernel_size * non_zero_num;
for (int j = 0; j < len; j++) {
const int out_feature_i =
index_groups[indices_i * kernel_size + j + group_offset];
LoadT vec_in;
phi::Load<T, VecSize>(
input + out_feature_i * channels + channels_i * VecSize, &vec_in);
#pragma unroll
for (int k = 0; k < VecSize; k++) {
sums[k] += vec_in[k];
}
}
}
phi::Store<T, VecSize>(sums,
out + indices_i * channels + channels_i * VecSize);
-out[indices_i * channels + channels_i] = sum;
}
}
template <typename T>
void ScatterV2(const GPUContext& dev_ctx,
const T* input,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_counts,
T* output) {
const int VecSize = VecBytes / sizeof(T);
if (channels % VecSize == 0) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels / VecSize, 1);
ScatterKernelV2<T, VecSize><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(input,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_counts,
output);
} else {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels, 1);
ScatterKernelV2<T, 1><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(input,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_counts,
output);
} }
} }
...
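The launchers above pick a vector width from VecBytes (16 bytes per load/store) and only take the vectorized path when the channel count is divisible by it; otherwise they fall back to VecSize = 1. A small host-side sketch of that dispatch rule, illustrative only and not the Paddle launcher.

#include <cstdio>

constexpr int kVecBytes = 16;  // same idea as the VecBytes macro above

// Returns the vector width a kernel with element type T would be launched with.
template <typename T>
int ChooseVecSize(int channels) {
  const int vec_size = kVecBytes / sizeof(T);         // e.g. 4 for float, 2 for double
  return (channels % vec_size == 0) ? vec_size : 1;   // fall back to scalar loads/stores
}

int main() {
  std::printf("float,  channels=64 -> VecSize=%d\n", ChooseVecSize<float>(64));   // 4
  std::printf("float,  channels=3  -> VecSize=%d\n", ChooseVecSize<float>(3));    // 1
  std::printf("double, channels=64 -> VecSize=%d\n", ChooseVecSize<double>(64));  // 2
  return 0;
}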
@@ -25,13 +25,16 @@ template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad);
@@ -40,13 +43,16 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dCooGrad(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
-const bool subm) {
+const bool subm,
const std::string& key) {
SparseCooTensor x_grad;
DenseTensor kernel_grad;
@@ -54,13 +60,16 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dCooGrad(
Conv3dCooGradKernel<T, Context>(dev_ctx,
x,
kernel,
out,
rulebook,
counter,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
key,
&x_grad,
&kernel_grad);
return std::make_tuple(x_grad, kernel_grad);
...
@@ -31,8 +31,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
-DenseTensor* rulebook);
+DenseTensor* rulebook,
DenseTensor* counter);
template <typename T, typename Context>
SparseCooTensor Conv3dCoo(const Context& dev_ctx,
@@ -43,7 +45,9 @@ SparseCooTensor Conv3dCoo(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
-DenseTensor* rulebook) {
+const std::string& key,
DenseTensor* rulebook,
DenseTensor* counter) {
SparseCooTensor coo;
Conv3dCooKernel<T, Context>(dev_ctx,
x,
@@ -53,8 +57,10 @@ SparseCooTensor Conv3dCoo(const Context& dev_ctx,
strides,
groups,
subm,
key,
&coo,
-rulebook);
+rulebook,
counter);
return coo;
}
...
@@ -41,13 +41,12 @@ void ProductRuleBook(const Context& dev_ctx,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
-DenseTensor* counter_per_kernel) {
+int* counter_per_kernel) {
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const IntT* indices_ptr = non_zero_indices.data<IntT>();
-int* counter_ptr = counter_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
-memset(counter_ptr, 0, kernel_size * sizeof(int));
+memset(counter_per_kernel, 0, kernel_size * sizeof(int));
int rulebook_len = 0;
// calc the rulebook_len
@@ -107,7 +106,7 @@ void ProductRuleBook(const Context& dev_ctx,
}
if (rulebook_ptr == nullptr) {
-counter_ptr[kernel_index - 1] += 1;
+counter_per_kernel[kernel_index - 1] += 1;
++rulebook_len;
} else {
rulebook_ptr[rulebook_index] = kernel_index - 1;
...
@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/sparse/cpu/convolution.h"
+#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
@@ -34,22 +34,27 @@ template <typename T, typename IntT = int>
void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
-const IntT* rulebook_ptr = rulebook.data<IntT>();
-const int rulebook_len = rulebook.dims()[1];
+int rulebook_len = 0;
const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr<IntT>(
out, rulebook, key, &rulebook_len);
const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key);
DenseTensorMeta in_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
@@ -86,16 +91,14 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
&x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
-std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0);
-for (int i = 0; i < rulebook_len; i++) {
-counter[rulebook_ptr[i]] += 1;
-}
-IntT offset = 0, max_count = 0;
+std::vector<IntT> offsets(kernel_size + 1);
+IntT offset = 0;
+int max_count = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
-offset += counter[i];
+offset += counter_ptr[i];
if (i < half_kernel_size) {
-max_count = std::max(max_count, counter[i]);
+max_count = std::max(max_count, counter_ptr[i]);
}
}
offsets[kernel_size] = offset;
@@ -129,11 +132,11 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
-if (counter[i] <= 0 || (subm && i == half_kernel_size)) {
+if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) {
continue;
}
-const int M = counter[i];
+const int M = counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
@@ -171,7 +174,7 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
// 4. scatter
Scatter<T, IntT>(d_x_features_ptr,
-rulebook.data<IntT>() + rulebook_len,
+rulebook_ptr + rulebook_len,
rulebook_len,
in_channels,
x_grad_values_ptr);
@@ -181,13 +184,16 @@ template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_VISIT_INTEGRAL_TYPES(
@@ -195,13 +201,16 @@ void Conv3dCooGradKernel(const Context& dev_ctx,
Conv3dCooGradCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
out,
rulebook,
counter,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
key,
x_grad,
kernel_grad);
}));
...
@@ -14,9 +14,10 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/sparse/cpu/convolution.h"
+#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
@@ -35,8 +36,10 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
-DenseTensor* rulebook) {
+DenseTensor* rulebook,
DenseTensor* counter) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
@@ -66,10 +69,30 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
-DenseTensorMeta counter_meta(
-DataType::INT32, {kernel_size}, DataLayout::NCHW);
-DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+DenseTensor h_counter, h_offsets;
+h_counter.Resize({kernel_size});
+h_offsets.Resize({kernel_size + 1});
int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);
// DenseTensor* rulebook = nullptr;
const IntT* rulebook_ptr = nullptr;
int n = 0;
bool need_product_rulebook = true;
if (subm && !key.empty()) {
rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, CPUContext>(
dev_ctx,
x,
key,
out_dims,
out,
h_counter_ptr,
h_offsets_ptr,
&n,
&need_product_rulebook);
}
if (need_product_rulebook) {
DenseTensor tmp_rulebook;
ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
@@ -78,14 +101,18 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
subm_strides,
out_dims,
subm,
-rulebook,
-&counter_per_kernel);
+&tmp_rulebook,
+h_counter_ptr);
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
-dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out);
+dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out);
n = tmp_rulebook.dims()[1];
rulebook_ptr = tmp_rulebook.data<IntT>();
-int n = rulebook->dims()[1];
-const int* counter_ptr = counter_per_kernel.data<int>();
+phi::funcs::sparse::SaveToTable(
+dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}
// int n = rulebook->dims()[1];
// 2. gather
DenseTensorMeta in_features_meta(
@@ -100,34 +127,33 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
T* out_features_ptr = out_features.data<T>();
Gather<T, IntT>(x.non_zero_elements().data<T>(),
-rulebook->data<IntT>() + n,
+rulebook_ptr + n,
n,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
-std::vector<int> offsets(kernel_size + 1);
int offset = 0;
for (int i = 0; i < kernel_size; i++) {
-offsets[i] = offset;
-offset += counter_ptr[i];
+h_offsets_ptr[i] = offset;
+offset += h_counter_ptr[i];
}
-offsets[kernel_size] = offset;
+h_offsets_ptr[kernel_size] = offset;
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
-if (counter_ptr[i] <= 0) {
+if (h_counter_ptr[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
-const int M = counter_ptr[i];
+const int M = h_counter_ptr[i];
const int K = in_channels; // in_channels
const int N = out_channels; // out_channels
-T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
+T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
-T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
+T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
@@ -143,11 +169,8 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
// 4. scatter
T* out_values_ptr = out->mutable_non_zero_elements()->data<T>();
memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels);
-Scatter<T, IntT>(out_features_ptr,
-rulebook->data<IntT>() + n * 2,
-n,
-out_channels,
-out_values_ptr);
+Scatter<T, IntT>(
+out_features_ptr, rulebook_ptr + n * 2, n, out_channels, out_values_ptr);
}
template <typename T, typename Context>
@@ -159,8 +182,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
-DenseTensor* rulebook) {
+DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] {
Conv3dCooCPUKernel<T, data_t>(dev_ctx,
@@ -171,8 +196,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
strides,
groups,
subm,
key,
out,
-rulebook);
+rulebook,
counter);
}));
}
...
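For reference, a small CPU-only sketch of the counter/offset bookkeeping that drives the per-kernel-offset GEMM loop above: the per-offset counts are prefix-summed into offsets, and each non-empty segment [offsets[i], offsets[i] + counter[i]) of the gathered features would be handled by one GEMM. Purely illustrative; it does not call Paddle's blas.

#include <cstdio>
#include <vector>

int main() {
  // counter[i]: how many rulebook entries use kernel offset i (kernel_size = 3 here).
  std::vector<int> counter = {2, 0, 3};

  // Exclusive prefix sum -> start offset of each kernel offset's segment.
  std::vector<int> offsets(counter.size() + 1, 0);
  for (size_t i = 0; i < counter.size(); ++i) {
    offsets[i + 1] = offsets[i] + counter[i];
  }

  // Each segment maps to one GEMM: (counter[i] x in_channels) * (in_channels x out_channels).
  for (size_t i = 0; i < counter.size(); ++i) {
    if (counter[i] <= 0) continue;  // same skip as in the kernels above
    std::printf("kernel offset %zu: rows [%d, %d)\n", i, offsets[i], offsets[i + 1]);
  }
  return 0;
}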
@@ -28,6 +28,7 @@ template <typename T, typename IntT = int>
void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
@@ -36,11 +37,10 @@ void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
const int channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1];
const IntT* rulebook_ptr = rulebook.data<IntT>();
-std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
-for (int i = 0; i < rulebook_len; i++) {
-counter[rulebook_ptr[i]] += 1;
-}
-phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
+std::vector<int> offsets(kernel_size + 1);
+const int* counter_ptr = counter.data<int>();
+phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>();
@@ -60,7 +60,7 @@ void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
phi::funcs::MaxPoolGrad<T> grad_functor;
for (int i = 0; i < kernel_size; i++) {
-for (int j = 0; j < counter[i]; j++) {
+for (int j = 0; j < counter_ptr[i]; j++) {
IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
for (int c = 0; c < channels; c++) {
@@ -78,6 +78,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
@@ -85,7 +86,7 @@ void MaxPoolCooGradKernel(const Context& dev_ctx,
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGradCPUKernel", ([&] {
MaxPoolCooGradCPUKernel<T, data_t>(
-dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
+dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad);
}));
}
...
@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
-#include "paddle/phi/kernels/sparse/cpu/convolution.h"
+#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
@@ -37,7 +37,8 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
-DenseTensor* rulebook) {
+DenseTensor* rulebook,
DenseTensor* counter) {
const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes =
@@ -47,9 +48,7 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
const int in_channels = real_kernel_sizes[3];
-DenseTensorMeta counter_meta(
-DataType::INT32, {kernel_size}, DataLayout::NCHW);
-DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+std::vector<int> counter_per_kernel(kernel_size, 0);
const T* in_features_ptr = x.non_zero_elements().data<T>();
// 1. product rule book
@@ -62,14 +61,17 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
out_dims,
false,
rulebook,
-&counter_per_kernel);
+counter_per_kernel.data());
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out);
int rulebook_len = rulebook->dims()[1];
const IntT* rulebook_ptr = rulebook->data<IntT>();
-const int* counter_ptr = counter_per_kernel.data<int>();
+counter->Resize({kernel_size});
+int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
+memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int));
std::vector<int> offsets(kernel_size + 1);
phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
@@ -105,7 +107,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
-DenseTensor* rulebook) {
+DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooCPUKernel", ([&] {
MaxPoolCooCPUKernel<T, data_t>(dev_ctx,
@@ -115,7 +118,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
dilations,
strides,
out,
-rulebook);
+rulebook,
counter);
}));
}
...
@@ -125,16 +125,35 @@ void CoalesceGPUKernel(const GPUContext& dev_ctx,
}
// 5. scatter the values
const int VecSize = VecBytes / sizeof(T);
if (stride % VecSize == 0) {
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, nnz * stride / VecSize, 1);
phi::funcs::sparse::ScatterKernel<T, VecSize>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(x_values_ptr,
public_indexs.data<int>(),
values_indexs_ptr,
out_nnz,
nnz,
stride,
out_values.data<T>());
} else {
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1);
-phi::funcs::sparse::ScatterKernel<T>
-<<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
-x_values_ptr,
+phi::funcs::sparse::ScatterKernel<T, 1>
+<<<config.block_per_grid,
+config.thread_per_block,
0,
dev_ctx.stream()>>>(x_values_ptr,
public_indexs.data<int>(),
values_indexs_ptr,
out_nnz,
nnz,
stride,
out_values.data<T>());
}
// 6. convert index to coordinate
Dim<DDim::kMaxRank> const_dims;
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/remove.h>
#include <thrust/unique.h>
#include "paddle/phi/kernels/sparse/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
namespace phi {
namespace sparse {
using Dims4D = phi::funcs::sparse::Dims4D;
// Vectorize loads and stores of global memory.
// In 3D point cloud scenes, slice_size values of 4, 8, 16, 32 and 64 are
// commonly used.
template <typename T, typename IndexT = int, int VecSize>
__global__ void GatherKernel(const T* params,
const IndexT* indices,
T* output,
size_t index_size,
size_t slice_size) {
CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size / VecSize, int64_t) {
const int vec_slice_size = slice_size / VecSize;
int indices_i = i / vec_slice_size;
int slice_i = i - indices_i * vec_slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
int64_t params_i = gather_i * slice_size + slice_i * VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
LoadT params_vec;
phi::Load<T, VecSize>(params + params_i, &params_vec);
phi::Store<T, VecSize>(params_vec, output + i * VecSize);
}
}
// double sparse, see GroupIndexs
template <typename T, typename IntT, int VecSize>
__global__ void GatherKernelV2(const T* inputs,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_count,
T* output) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
for (int i = tid; i < non_zero_num * vec_channels;
i += gridDim.x * blockDim.x) {
int indices_i = i / vec_channels;
int channels_i = i - indices_i * vec_channels;
LoadT in_vec;
phi::Load<T, VecSize>(inputs + indices_i * channels + channels_i * VecSize,
&in_vec);
#pragma unroll
for (int it = 0; it < buffer_count; it++) {
int len = index_counts[indices_i + it * non_zero_num];
const int group_offset = it * kernel_size * non_zero_num;
#pragma unroll
for (int j = 0; j < len; j++) {
int out_i = index_groups[indices_i * kernel_size + j + group_offset];
phi::Store<T, VecSize>(
in_vec, output + out_i * channels + channels_i * VecSize);
}
}
}
}
template <typename T, typename IntT>
inline void Gather(const GPUContext& dev_ctx,
const T* inputs,
const IntT* indices,
const int indices_size,
const int channels,
T* output) {
const int VecSize = VecBytes / sizeof(T);
if (channels % VecSize == 0) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, indices_size * channels / VecSize, 1);
GatherKernel<T, IntT, VecSize>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(inputs, indices, output, indices_size, channels);
} else {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, indices_size * channels, 1);
GatherKernel<T, IntT, 1>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(inputs, indices, output, indices_size, channels);
}
}
template <typename T, typename IntT>
inline void GatherV2(const GPUContext& dev_ctx,
const T* inputs,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_count,
T* output) {
const int VecSize = VecBytes / sizeof(T);
if (channels % VecSize == 0) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels / VecSize, 1);
GatherKernelV2<T, IntT, VecSize><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(inputs,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_count,
output);
} else {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels, 1);
GatherKernelV2<T, IntT, 1><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(inputs,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_count,
output);
}
}
// unique the out indexs in rulebook
template <typename IntT>
__global__ void UniqueKernel(const IntT* in_indexs,
const int rulebook_len,
int* out_index_table,
int* out_indexs,
int* nnz) {
extern __shared__ int cache[];
__shared__ int count, start;
if (threadIdx.x == 0) {
count = 0;
start = 0;
}
__syncthreads();
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < rulebook_len) {
// atomicOr only supports int
int index = static_cast<int>(in_indexs[i]);
int change_index = index == 0 ? -1 : index;
int flag = atomicOr(out_index_table + index, change_index);
if (flag == 0) {
int j = atomicAdd(&count, 1);
cache[j] = index;
}
}
__syncthreads();
if (threadIdx.x == 0) {
start = atomicAdd(nnz, count);
}
__syncthreads();
for (int i = threadIdx.x; i < count; i += blockDim.x) {
out_indexs[start + i] = cache[i];
}
}
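UniqueKernel deduplicates the candidate output indices: out_index_table doubles as an occupancy map (index 0 is remapped to -1 so a "seen" entry is always non-zero), and only the first thread to touch an entry appends it to the block-local cache. A sequential sketch of the same idea, using plain host arrays in place of device memory.

#include <cstdio>
#include <vector>

int main() {
  // Candidate output indices produced by the rulebook (duplicates expected).
  std::vector<int> in_indexs = {20, 30, 33, 30, 33, 20, 25, 0};

  // Table sized to the dense output volume; 0 means "not seen yet".
  std::vector<int> table(64, 0);
  std::vector<int> unique;

  for (int index : in_indexs) {
    // Index 0 is stored as -1 so that a seen entry is always non-zero,
    // mirroring the atomicOr trick in UniqueKernel.
    const int change_index = (index == 0) ? -1 : index;
    if (table[index] == 0) {  // first touch wins
      table[index] = change_index;
      unique.push_back(index);
    }
  }

  for (int v : unique) std::printf("%d ", v);  // prints: 20 30 33 25 0
  std::printf("\n");
  return 0;
}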
template <typename IntT>
__global__ void GroupIndexs(const int* out_index_table,
const int n,
const int kernel_size,
IntT* out_indexs,
int* out_index_counts,
int* out_index_groups) {
CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
IntT index = out_indexs[i];
int real_index = out_index_table[index];
out_indexs[i] = real_index;
// kernel_size at most
int j = atomicAdd(out_index_counts + real_index, 1);
// nnz * kernel_size
out_index_groups[real_index * kernel_size + j] = i;
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participate in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: save the number of times each location in the kernel participates in
* the calculation
**/
template <typename T>
__global__ void ProductRuleBookKernel(const T* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
T* rulebook,
int* counter) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
T batch = x_indices[i];
T in_z = x_indices[i + non_zero_num];
T in_y = x_indices[i + 2 * non_zero_num];
T in_x = x_indices[i + 3 * non_zero_num];
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
T out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
T out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
// rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
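ProductRuleBookKernel maps each input coordinate and kernel offset to an output coordinate via out = (in + padding - k * dilation) / stride, keeping the pair only when the candidate lies on the stride grid and inside the output extent (the Check call). A simplified 1-D illustration of that per-axis rule; Contributes is a hypothetical helper, not phi::funcs::sparse::Check.

#include <cstdio>

// Valid iff (in + pad - k * dilation) lands on a stride multiple inside the output.
bool Contributes(int in, int k, int pad, int dilation, int stride, int out_size, int* out) {
  const int numerator = in + pad - k * dilation;
  if (numerator < 0 || numerator % stride != 0) return false;
  const int o = numerator / stride;
  if (o >= out_size) return false;
  *out = o;
  return true;
}

int main() {
  int out = -1;
  // in=5, kernel offset k=0, pad=1, dilation=1, stride=2, out_size=4 -> (5+1-0)/2 = 3.
  if (Contributes(5, 0, 1, 1, 2, 4, &out)) {
    std::printf("input 5 with kernel offset 0 maps to output %d\n", out);  // 3
  }
  return 0;
}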
template <typename IntT>
__global__ void GetOutIndexTable(const IntT* indices,
const IntT non_zero_num,
const Dims4D dims,
int* out_index_table) {
CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
IntT batch = indices[i];
IntT in_z = indices[i + non_zero_num];
IntT in_y = indices[i + 2 * non_zero_num];
IntT in_x = indices[i + 3 * non_zero_num];
IntT index = PointToIndex(batch, in_x, in_y, in_z, dims);
out_index_table[index] = i == 0 ? -1 : i;
}
}
template <typename IntT>
__global__ void GetOutIndexTable(int* indexs,
const int non_zero_num,
const Dims4D out_dims,
int* out_index_table,
IntT* out_indices) {
CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
IntT index = static_cast<IntT>(indexs[i]);
out_index_table[index] = i;
IntT batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
indexs[i] = 0;
}
}
template <typename IntT>
__global__ void CopyRuleBook(const int* counters,
const int* offsets,
const IntT* in_rulebook,
const int len,
const int kernel_size,
const int non_zero_num,
IntT* out_rulebook) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
extern __shared__ int cache_counters[];
int* cache_offsets = cache_counters + kernel_size;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
cache_counters[i] = counters[i];
cache_offsets[i] = offsets[i];
}
__syncthreads();
for (int i = tid; i < len; i += gridDim.x * blockDim.x) {
// get the kernel index
int kernel_index = 0;
for (; kernel_index < kernel_size - 1; kernel_index++) {
if (i >= offsets[kernel_index] && i < offsets[kernel_index + 1]) {
break;
}
}
int inner_index = i - offsets[kernel_index];
out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index];
out_rulebook[len + i] =
in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num +
inner_index];
}
}
template <typename T>
__global__ void ProductSubmRuleBookKernel(const T* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const int* out_index_table,
T* rulebook,
int* counter) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
extern __shared__ int counter_buf[]; // kernel_size
int* counter_buf2 = counter_buf + kernel_size;
// length = kernel_size * blockDim.x * 2;
int* rulebook_buf = counter_buf + kernel_size * 2;
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
T batch = x_indices[i];
T in_z = x_indices[i + non_zero_num];
T in_y = x_indices[i + 2 * non_zero_num];
T in_x = x_indices[i + 3 * non_zero_num];
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
T out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
T out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
int real_out_index = out_index_table[out_index];
if (real_out_index != 0) {
real_out_index = real_out_index == -1 ? 0 : real_out_index;
in_i = i;
int buf_i = atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
rulebook_buf[kernel_index * blockDim.x + buf_i] = in_i;
rulebook_buf[kernel_index * blockDim.x +
kernel_size * blockDim.x + buf_i] = real_out_index;
}
}
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf2[i] = atomicAdd(&counter[i], counter_buf[i]);
}
__syncthreads();
for (int i = 0; i < kernel_size; i++) {
if (threadIdx.x < counter_buf[i]) {
// rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i;
rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] =
rulebook_buf[i * blockDim.x + threadIdx.x];
rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] =
rulebook_buf[i * blockDim.x + kernel_size * blockDim.x + threadIdx.x];
}
}
}
template <typename IntT>
__global__ void GroupIndexs(const int n,
const int kernel_size,
const IntT* indexs,
int* index_counts,
int* index_groups) {
CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
IntT index = indexs[i];
// kernel_size at most
int j = atomicAdd(index_counts + index, 1);
// nnz * kernel_size
index_groups[index * kernel_size + j] = i;
}
}
// double space to reduce atomicAdd conflict
template <typename IntT>
__global__ void GroupIndexsV2(const int rulebook_len,
const int non_zero_num,
const int kernel_size,
const int half_kernel_offset,
const IntT* indexs,
int* index_counts,
int* index_groups) {
CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) {
IntT index = indexs[i];
int* counts_ptr =
i < half_kernel_offset ? index_counts : index_counts + non_zero_num;
int* groups_ptr = i < half_kernel_offset
? index_groups
: index_groups + non_zero_num * kernel_size;
// conflict kernel_size times at most
int j = atomicAdd(counts_ptr + index, 1);
// nnz * kernel_size
groups_ptr[index * kernel_size + j] = i;
}
}
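GroupIndexs builds, for every output row, the list of rulebook positions that scatter into it: index_counts[row] is the number of contributions and index_groups[row * kernel_size + j] stores the j-th contributing rulebook position (a row can be hit at most kernel_size times). A small CPU sketch of that grouping under the same layout assumptions; on the GPU the increment is an atomicAdd.

#include <cstdio>
#include <vector>

int main() {
  const int kernel_size = 3;
  const int nnz = 4;  // number of output rows

  // out_index[i]: which output row rulebook entry i scatters into.
  std::vector<int> out_index = {0, 2, 1, 2, 0, 3};

  std::vector<int> index_counts(nnz, 0);
  std::vector<int> index_groups(nnz * kernel_size, -1);

  for (int i = 0; i < static_cast<int>(out_index.size()); ++i) {
    const int row = out_index[i];
    const int j = index_counts[row]++;        // atomicAdd on the GPU
    index_groups[row * kernel_size + j] = i;  // at most kernel_size entries per row
  }

  for (int row = 0; row < nnz; ++row) {
    std::printf("row %d gathers from rulebook entries:", row);
    for (int j = 0; j < index_counts[row]; ++j) {
      std::printf(" %d", index_groups[row * kernel_size + j]);
    }
    std::printf("\n");
  }
  return 0;
}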
inline void CallThrustScan(const GPUContext& dev_ctx,
const int* counter_ptr,
const int kernel_size,
int* offsets_ptr,
int* h_counter_ptr,
int* h_offsets_ptr) {
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
phi::backends::gpu::GpuMemcpyAsync(h_counter_ptr,
counter_ptr,
kernel_size * sizeof(int),
gpuMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(h_offsets_ptr,
offsets_ptr,
kernel_size * sizeof(int),
gpuMemcpyDeviceToHost,
dev_ctx.stream());
}
// the basic algorithm can be found in convolution_kernel.cc or in
// the second paper
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value)
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context, typename IntT = int>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_value,
SparseCooTensor* out,
int* h_counter,
int* h_offsets) {
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const IntT* indices_ptr = non_zero_indices.data<IntT>();
int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::backends::gpu::GpuMemsetAsync(counter_ptr,
0,
sizeof(int) * counter_per_kernel->numel(),
dev_ctx.stream());
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
const int rulebook_rows = 2;
const int rulebook_cols = kernel_size * non_zero_num;
DenseTensorMeta rulebook_meta(
indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW);
int64_t table_size = 1;
for (int i = 0; i < out_dims.size() - 1; i++) {
table_size *= out_dims[i];
}
DenseTensor out_index_table = phi::Empty<int>(dev_ctx, {table_size});
int* out_index_table_ptr = out_index_table.data<int>();
if (subm) {
DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
IntT* rulebook_ptr = tmp_rulebook.data<IntT>();
DenseTensor out_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor out_values = phi::Empty<T>(dev_ctx, {x.nnz(), kernel_sizes[4]});
phi::Copy(
dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices);
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
GetOutIndexTable<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
out_indices.data<IntT>(), non_zero_num, d_x_dims, out_index_table_ptr);
    size_t cache_size =
        (kernel_size * 2 + kernel_size * config.thread_per_block.x * 2) *
        sizeof(int);
const int MAX_CACHE_SIZE = 48 * 1024;
while (cache_size >= MAX_CACHE_SIZE) {
config.thread_per_block.x /= 2;
config.block_per_grid.x *= 2;
PADDLE_ENFORCE_GE(config.thread_per_block.x,
32,
phi::errors::Fatal("the shared memory is not enough"));
      cache_size =
          (kernel_size * 2 + kernel_size * config.thread_per_block.x * 2) *
          sizeof(int);
}
ProductSubmRuleBookKernel<IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
cache_size,
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
out_index_table_ptr,
rulebook_ptr,
counter_ptr);
out->SetMember(out_indices, out_values, out_dims, false);
CallThrustScan(
dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets);
dev_ctx.Wait();
int rulebook_len = h_offsets[kernel_size - 1] + h_counter[kernel_size - 1];
DenseTensor out_rulebook =
phi::Empty<IntT>(dev_ctx, {rulebook_rows, rulebook_len});
IntT* out_rulebook_ptr = out_rulebook.data<IntT>();
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
cache_size = kernel_size * 2 * sizeof(int);
CopyRuleBook<IntT><<<config.block_per_grid,
config.thread_per_block,
cache_size,
dev_ctx.stream()>>>(counter_ptr,
offsets_ptr,
rulebook_ptr,
rulebook_len,
kernel_size,
non_zero_num,
out_rulebook_ptr);
*rulebook = out_rulebook;
return rulebook_len;
} else {
*rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
IntT* rulebook_ptr = rulebook->data<IntT>();
ProductRuleBookKernel<IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
rulebook_ptr,
counter_ptr);
// 2. remove -1
#ifdef PADDLE_WITH_HIP
IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols,
-1);
IntT rulebook_len = (last - rulebook_ptr) / 2;
CallThrustScan(
dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets);
rulebook->Resize({rulebook_rows, static_cast<int>(rulebook_len)});
  // 3. sort or merge the out index
out_index->ResizeAndAllocate({static_cast<int>(rulebook_len)});
DenseTensor unique_key =
phi::Empty<int>(dev_ctx, {static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index->data<int>();
int* unique_key_ptr = unique_key.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
phi::backends::gpu::GpuMemsetAsync(
unique_key_ptr, 0, sizeof(int), dev_ctx.stream());
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
size_t cache_size = sizeof(int) * config.thread_per_block.x;
UniqueKernel<IntT><<<config.block_per_grid,
config.thread_per_block,
cache_size,
dev_ctx.stream()>>>(rulebook_ptr + rulebook_len,
rulebook_len,
out_index_table_ptr,
out_index_ptr,
unique_key_ptr);
int out_nnz = 0;
phi::backends::gpu::GpuMemcpyAsync(&out_nnz,
unique_key_ptr,
sizeof(int),
gpuMemcpyDeviceToHost,
dev_ctx.stream());
dev_ctx.Wait();
const int64_t sparse_dim = 4;
phi::DenseTensor out_indices =
phi::Empty<IntT>(dev_ctx, {sparse_dim, out_nnz});
phi::DenseTensor out_values =
phi::Empty<T>(dev_ctx, {out_nnz, kernel_sizes[4]});
out->SetMember(out_indices, out_values, out_dims, false);
IntT* out_indices_ptr = out_indices.data<IntT>();
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1);
GetOutIndexTable<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_index_ptr,
out_nnz,
d_out_dims,
out_index_table_ptr,
out_indices_ptr);
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value->ResizeAndAllocate({static_cast<int>(out_nnz * kernel_size)});
int* unique_value_ptr = unique_value->data<int>();
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_index_table_ptr,
rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
return rulebook_len;
}
}
} // namespace sparse
} // namespace phi
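The sort → unique → renumber scheme described in the comment above ProductRuleBook can be summarized with a minimal host-side Python sketch. The GPU code does the same job with out_index_table, atomics and thrust; everything below is illustrative only and none of these names exist in Paddle.

# Minimal illustration of the "sort -> unique -> renumber" step; a host-side
# sketch of the idea, not the CUDA implementation.
def renumber_out_index(out_index):
    # sort + unique: every distinct output coordinate becomes one output nonzero
    unique_keys = sorted(set(out_index))
    # map each raw key to the index of its unique key
    remap = {key: i for i, key in enumerate(unique_keys)}
    new_out_index = [remap[key] for key in out_index]
    return unique_keys, new_out_index

# Example from the comment: keys 20, 30, 33, 30, 33, 20, 25
unique_keys, new_out_index = renumber_out_index([20, 30, 33, 30, 33, 20, 25])
assert unique_keys == [20, 25, 30, 33]
assert new_out_index == [0, 2, 3, 2, 3, 0, 1]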
...@@ -19,13 +19,11 @@ limitations under the License. */ ...@@ -19,13 +19,11 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h" #include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -42,43 +40,42 @@ template <typename T, typename IntT> ...@@ -42,43 +40,42 @@ template <typename T, typename IntT>
void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
const std::string& key,
SparseCooTensor* x_grad, SparseCooTensor* x_grad,
DenseTensor* kernel_grad) { DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1]; int rulebook_len = 0;
const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr<IntT>(
out, rulebook, key, &rulebook_len);
const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key);
DenseTensorMeta in_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
DenseTensorMeta d_x_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_grad_features_meta(
x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features = phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta)); phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor d_x_features = phi::DenseTensor d_x_features =
phi::Empty(dev_ctx, std::move(d_x_features_meta)); phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_grad_features = phi::DenseTensor out_grad_features =
phi::Empty(dev_ctx, std::move(out_grad_features_meta)); phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>(); T* in_features_ptr = in_features.data<T>();
T* d_x_features_ptr = d_x_features.data<T>(); T* d_x_features_ptr = d_x_features.data<T>();
T* out_grad_features_ptr = out_grad_features.data<T>(); T* out_grad_features_ptr = out_grad_features.data<T>();
*kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel); *kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
T* d_kernel_ptr = kernel_grad->data<T>(); T* d_kernel_ptr = kernel_grad->data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero; phi::backends::gpu::GpuMemsetAsync(
set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f)); d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream());
int half_kernel_size = kernel_size / 2; int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
...@@ -86,8 +83,12 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -86,8 +83,12 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices()); phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements()); DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>(); T* x_grad_values_ptr = x_grad_values.data<T>();
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f)); phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr,
set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f)); 0,
sizeof(T) * x_grad_values.numel(),
dev_ctx.stream());
phi::backends::gpu::GpuMemsetAsync(
d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream());
phi::Copy<GPUContext>(dev_ctx, phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(), x.non_zero_indices(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
...@@ -95,29 +96,14 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -95,29 +96,14 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
&x_grad_indices); &x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0), std::vector<int> offsets(kernel_size + 1);
h_counter(rulebook_len, 0);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr,
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
for (int i = 0; i < rulebook_len; i++) { int offset = 0, max_count = 0;
counter[h_counter[i]] += 1;
}
IntT offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset; offsets[i] = offset;
offset += counter[i]; offset += counter_ptr[i];
if (i < half_kernel_size) { if (i < half_kernel_size) {
max_count = std::max(max_count, counter[i]); max_count = std::max(max_count, counter_ptr[i]);
} }
} }
offsets[kernel_size] = offset; offsets[kernel_size] = offset;
...@@ -138,36 +124,52 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -138,36 +124,52 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
} }
} }
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config =
dev_ctx, rulebook_len * in_channels, 1); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
GatherKernel<T, IntT><<<config.block_per_grid.x, DenseTensor unique_value = phi::Empty<int>(
config.thread_per_block.x, dev_ctx, {static_cast<int>(x_grad->nnz() * kernel_size * 2)});
0, DenseTensor out_index =
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(), phi::Empty<int>(dev_ctx, {static_cast<int>(x.nnz() * 2)});
rulebook_ptr + rulebook_len, int* out_index_ptr = out_index.data<int>();
in_features_ptr, int* unique_value_ptr = unique_value.data<int>();
rulebook_len, phi::backends::gpu::GpuMemsetAsync(
in_channels); out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream());
config = phi::backends::gpu::GetGpuLaunchConfig1D( GroupIndexsV2<<<config.block_per_grid,
dev_ctx, rulebook_len * out_channels, 1); config.thread_per_block,
GatherKernel<T, IntT>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(), dev_ctx.stream()>>>(rulebook_len,
rulebook_ptr + rulebook_len * 2, x.nnz(),
out_grad_features_ptr, kernel_size,
offsets[kernel_size / 2],
rulebook_ptr,
out_index_ptr,
unique_value_ptr);
GatherV2<T, IntT>(dev_ctx,
x.non_zero_elements().data<T>(),
out_index_ptr,
unique_value_ptr,
x.nnz(),
kernel_size,
in_channels,
2,
in_features_ptr);
Gather<T, IntT>(dev_ctx,
out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
rulebook_len, rulebook_len,
out_channels); out_channels,
out_grad_features_ptr);
const T* kernel_ptr = kernel.data<T>(); const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0 || (subm && i == half_kernel_size)) { if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) {
continue; continue;
} }
const int M = counter[i]; const int M = counter_ptr[i];
const int K = in_channels; const int K = in_channels;
const int N = out_channels; const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
...@@ -204,32 +206,31 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -204,32 +206,31 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
} }
// 4. scatter // 4. scatter
config = phi::backends::gpu::GetGpuLaunchConfig1D( phi::funcs::sparse::ScatterV2<T>(dev_ctx,
dev_ctx, rulebook_len * in_channels, 1);
phi::funcs::ScatterCUDAKernel<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
d_x_features_ptr, d_x_features_ptr,
rulebook_ptr + rulebook_len, out_index.data<int>(),
x_grad_values_ptr, unique_value.data<int>(),
rulebook_len, x_grad->nnz(),
kernel_size,
in_channels, in_channels,
false); 2,
x_grad_values_ptr);
} }
template <typename T, typename Context> template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx, void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
const std::string& key,
SparseCooTensor* x_grad, SparseCooTensor* x_grad,
DenseTensor* kernel_grad) { DenseTensor* kernel_grad) {
PD_VISIT_INTEGRAL_TYPES( PD_VISIT_INTEGRAL_TYPES(
...@@ -237,13 +238,16 @@ void Conv3dCooGradKernel(const Context& dev_ctx, ...@@ -237,13 +238,16 @@ void Conv3dCooGradKernel(const Context& dev_ctx,
Conv3dCooGradGPUKernel<T, data_t>(dev_ctx, Conv3dCooGradGPUKernel<T, data_t>(dev_ctx,
x, x,
kernel, kernel,
out,
rulebook, rulebook,
counter,
out_grad, out_grad,
paddings, paddings,
dilations, dilations,
strides, strides,
groups, groups,
subm, subm,
key,
x_grad, x_grad,
kernel_grad); kernel_grad);
})); }));
......
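The GEMM calls of the backward pass are elided in the hunk above, so the following is only the textbook gather-GEMM-scatter backward, written as a hedged NumPy sketch. It assumes a 2 x rulebook_len rulebook whose first row holds input-nonzero indices and whose second row holds output-nonzero indices, grouped per kernel offset, and it ignores the submanifold special case for the center offset; it is not a transcription of the Paddle kernels.

import numpy as np

# Hedged sketch of the conventional gather-GEMM-scatter backward. kernel has
# shape (kernel_size, in_channels, out_channels); counter[i] pairs per offset.
def sparse_conv_backward(x_values, kernel, rulebook, counter, out_grad_values):
    d_x = np.zeros_like(x_values)
    d_kernel = np.zeros_like(kernel)
    offsets = np.concatenate(([0], np.cumsum(counter)))  # exclusive scan
    for i in range(kernel.shape[0]):
        m = counter[i]
        if m <= 0:
            continue
        in_idx = rulebook[0, offsets[i]:offsets[i] + m]
        out_idx = rulebook[1, offsets[i]:offsets[i] + m]
        g_in = x_values[in_idx]            # gathered inputs,   (m, Cin)
        g_grad = out_grad_values[out_idx]  # gathered out grad, (m, Cout)
        d_kernel[i] = g_in.T @ g_grad                  # (Cin, Cout) GEMM
        np.add.at(d_x, in_idx, g_grad @ kernel[i].T)   # scatter-add into dX
    return d_x, d_kernel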
...@@ -21,7 +21,9 @@ limitations under the License. */ ...@@ -21,7 +21,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#include "glog/logging.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -35,8 +37,10 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, ...@@ -35,8 +37,10 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
const std::string& key,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook,
DenseTensor* counter) {
// update padding and dilation // update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1 // Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight) // if x.layout != NDHWC then transpose(x), transpose(weight)
...@@ -61,22 +65,40 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, ...@@ -61,22 +65,40 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size); DenseTensor h_counter, h_offsets;
h_counter.Resize({kernel_size});
h_offsets.Resize({kernel_size + 1});
int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);
// Second algorithm: // Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook // 1. product rulebook
DenseTensorMeta counter_meta( DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
DenseTensorMeta offsets_meta( DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); int rulebook_len = 0;
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); const IntT* rulebook_ptr = nullptr;
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); bool need_product_rulebook = true;
if (subm && !key.empty()) {
int n = ProductRuleBook<T, GPUContext, IntT>(dev_ctx, rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
dev_ctx,
x,
key,
out_dims,
out,
h_counter.data<int>(),
h_offsets.data<int>(),
&rulebook_len,
&need_product_rulebook);
}
if (need_product_rulebook) {
DenseTensor tmp_rulebook;
rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x, x,
kernel_sizes, kernel_sizes,
subm_paddings, subm_paddings,
...@@ -84,62 +106,76 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, ...@@ -84,62 +106,76 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
subm_strides, subm_strides,
out_dims, out_dims,
subm, subm,
rulebook, &tmp_rulebook,
&counter_per_kernel, &counter_per_kernel,
&offsets_per_kernel, &offsets_per_kernel,
&out_index, &out_index,
&unique_value, &unique_value,
out, out,
&h_counter, h_counter_ptr,
&offsets); h_offsets_ptr);
rulebook_ptr = tmp_rulebook.data<IntT>();
const int* counter_ptr = counter_per_kernel.data<int>(); phi::funcs::sparse::SaveToTable(
const int* offsets_ptr = counter_per_kernel.data<int>(); dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
const IntT* rulebook_ptr = rulebook->data<IntT>(); }
// 2. gather // 2. gather
DenseTensorMeta in_features_meta(
x.dtype(), {n, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_features_meta(
x.dtype(), {n, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features = phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta)); phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features = phi::DenseTensor out_features =
phi::Empty(dev_ctx, std::move(out_features_meta)); phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>(); T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>(); T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero; phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f)); set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
auto config = Gather<T, IntT>(dev_ctx,
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); x.non_zero_elements().data<T>(),
GatherKernel<T, IntT><<<config.block_per_grid.x, rulebook_ptr,
config.thread_per_block.x, rulebook_len,
0, in_channels,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(), in_features_ptr);
rulebook_ptr + n,
in_features_ptr,
n,
in_channels);
// 3. call gemm for every weight // 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements(); auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>(); T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
}
const T* kernel_ptr = kernel.data<T>(); const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
if (h_counter[i] <= 0) { if (h_counter_ptr[i] <= 0) {
continue; continue;
} }
// call gemm: (n, in_channels) * (in_channels, out_channels) // call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter[i]; const int M = h_counter_ptr[i];
const int K = in_channels; const int K = in_channels;
const int N = out_channels; const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N; const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans, blas.GEMM(CblasNoTrans,
CblasNoTrans, CblasNoTrans,
...@@ -154,40 +190,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, ...@@ -154,40 +190,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
} }
// 4. scatter // 4. scatter
if (subm) { phi::funcs::sparse::ScatterV2<T>(dev_ctx,
set_zero(dev_ctx, out_values, static_cast<T>(0.0f)); out_features_ptr,
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1);
phi::funcs::ScatterCUDAKernel<T, IntT>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_features_ptr,
rulebook_ptr + 2 * n,
out_values_ptr,
n,
out_channels,
false);
} else {
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, out->nnz() * out_channels, 1);
phi::funcs::sparse::ScatterKernel<T>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_features_ptr,
unique_value.data<int>(),
out_index.data<int>(), out_index.data<int>(),
unique_value.data<int>(),
out->nnz(), out->nnz(),
n, kernel_size,
out_channels, out_channels,
1,
out_values_ptr); out_values_ptr);
}
} }
/** /**
* x: (N, D, H, W, C) * x: the input SparseCooTensor, shape is (N, D, H, W, C)
* kernel: (D, H, W, C, OC) * kernel: the weight data, shape is (D, H, W, C, OC)
* out: (N, D, H, W, OC) * out: the output SparseCooTensor, shape is (N, D, H, W, OC)
 * rulebook: return the rulebook if key is not valid, else return nullptr
 * counter: return the counter if key is not valid, else return nullptr
**/ **/
template <typename T, typename Context> template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx, void Conv3dCooKernel(const Context& dev_ctx,
...@@ -198,8 +217,10 @@ void Conv3dCooKernel(const Context& dev_ctx, ...@@ -198,8 +217,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
const std::string& key,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES( PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] {
Conv3dCooGPUKernel<T, data_t>(dev_ctx, Conv3dCooGPUKernel<T, data_t>(dev_ctx,
...@@ -210,8 +231,10 @@ void Conv3dCooKernel(const Context& dev_ctx, ...@@ -210,8 +231,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
strides, strides,
groups, groups,
subm, subm,
key,
out, out,
rulebook); rulebook,
counter);
})); }));
} }
......
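For the forward path, the per-offset loop above ("call gemm for every weight", with M = h_counter[i], K = in_channels, N = out_channels) boils down to the following hedged NumPy sketch. The same rulebook layout is assumed as in the backward sketch earlier (row 0: input-nonzero index, row 1: output-nonzero index, grouped by kernel offset); all names are illustrative.

import numpy as np

# Hedged sketch of the forward gather -> GEMM -> scatter flow. kernel has
# shape (kernel_size, in_channels, out_channels); illustrative only.
def sparse_conv_forward(x_values, kernel, rulebook, counter, out_nnz):
    out_values = np.zeros((out_nnz, kernel.shape[2]), dtype=x_values.dtype)
    offsets = np.concatenate(([0], np.cumsum(counter)))  # exclusive scan
    for i in range(kernel.shape[0]):
        m = counter[i]
        if m <= 0:
            continue
        in_idx = rulebook[0, offsets[i]:offsets[i] + m]
        out_idx = rulebook[1, offsets[i]:offsets[i] + m]
        gathered = x_values[in_idx]              # gather:  (m, in_channels)
        partial = gathered @ kernel[i]           # GEMM:    (m, out_channels)
        np.add.at(out_values, out_idx, partial)  # scatter-add into out nnz
    return out_values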
...@@ -238,6 +238,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, ...@@ -238,6 +238,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx,
x_indexs_ptr, x_indexs.numel(), table.data<int>()); x_indexs_ptr, x_indexs.numel(), table.data<int>());
config = config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1);
const int VecBytes = 16; const int VecBytes = 16;
const int VecSize = VecBytes / sizeof(T); const int VecSize = VecBytes / sizeof(T);
if (stride % VecSize == 0) { if (stride % VecSize == 0) {
......
...@@ -55,6 +55,7 @@ template <typename T, typename IntT = int> ...@@ -55,6 +55,7 @@ template <typename T, typename IntT = int>
void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx, void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out, const SparseCooTensor& out,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
...@@ -63,23 +64,9 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -63,23 +64,9 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
const int in_channels = x.dims()[4]; const int in_channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1]; int rulebook_len = rulebook.dims()[1];
const IntT* rulebook_ptr = rulebook.data<IntT>(); const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0), std::vector<int> offsets(kernel_size + 1);
h_counter(rulebook_len, 0); const int* counter_ptr = counter.data<int>();
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
rulebook_ptr,
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
for (int i = 0; i < rulebook_len; i++) {
counter[h_counter[i]] += 1;
}
phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>(); const T* out_features_ptr = out.non_zero_elements().data<T>();
...@@ -99,12 +86,12 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -99,12 +86,12 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
&x_grad_indices); &x_grad_indices);
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) { if (counter_ptr[i] <= 0) {
continue; continue;
} }
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1); dev_ctx, counter_ptr[i] * in_channels, 1);
MaxPoolGradCudaKernel<T, IntT> MaxPoolGradCudaKernel<T, IntT>
<<<config.block_per_grid.x, <<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
...@@ -112,8 +99,8 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx, ...@@ -112,8 +99,8 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
dev_ctx.stream()>>>(in_features_ptr, dev_ctx.stream()>>>(in_features_ptr,
out_features_ptr, out_features_ptr,
out_grad_ptr, out_grad_ptr,
rulebook_ptr + offsets[i] + rulebook_len, rulebook_ptr + offsets[i],
counter[i], counter_ptr[i],
rulebook_len, rulebook_len,
in_channels, in_channels,
x_grad_ptr); x_grad_ptr);
...@@ -124,6 +111,7 @@ template <typename T, typename Context> ...@@ -124,6 +111,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx, void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out, const SparseCooTensor& out,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
...@@ -131,7 +119,7 @@ void MaxPoolCooGradKernel(const Context& dev_ctx, ...@@ -131,7 +119,7 @@ void MaxPoolCooGradKernel(const Context& dev_ctx,
PD_VISIT_INTEGRAL_TYPES( PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGradGPUKernel", ([&] { x.non_zero_indices().dtype(), "MaxPoolCooGradGPUKernel", ([&] {
MaxPoolCooGradGPUKernel<T, data_t>( MaxPoolCooGradGPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad);
})); }));
} }
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h" #include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -55,7 +55,8 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, ...@@ -55,7 +55,8 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook,
DenseTensor* counter) {
const auto& x_dims = x.dims(); const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes = const std::vector<int>& real_kernel_sizes =
...@@ -65,7 +66,7 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, ...@@ -65,7 +66,7 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
const int in_channels = real_kernel_sizes[3]; const int in_channels = real_kernel_sizes[3];
std::vector<int> offsets(kernel_size + 1), counter(kernel_size); std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
DenseTensorMeta counter_meta( DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW); DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
...@@ -89,13 +90,16 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, ...@@ -89,13 +90,16 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
&out_index, &out_index,
&unique_value, &unique_value,
out, out,
&counter, h_counter.data(),
&offsets); offsets.data());
const IntT* rulebook_ptr = rulebook->data<IntT>(); const IntT* rulebook_ptr = rulebook->data<IntT>();
T* out_features_ptr = out->mutable_non_zero_elements()->data<T>(); T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
counter->Resize({kernel_size});
int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int));
// 2. max pool // 2. max pool
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
thrust::fill(thrust::hip::par.on(dev_ctx.stream()), thrust::fill(thrust::hip::par.on(dev_ctx.stream()),
...@@ -107,19 +111,18 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, ...@@ -107,19 +111,18 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
static_cast<T>(0)); static_cast<T>(0));
// TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) { if (h_counter[i] <= 0) {
continue; continue;
} }
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1); dev_ctx, h_counter[i] * in_channels, 1);
MaxPoolCudaKernel<T, IntT> MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
<<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>(in_features_ptr, dev_ctx.stream()>>>(in_features_ptr,
rulebook_ptr + offsets[i] + rulebook_len, rulebook_ptr + offsets[i],
counter[i], h_counter[i],
rulebook_len, rulebook_len,
in_channels, in_channels,
out_features_ptr); out_features_ptr);
...@@ -134,7 +137,8 @@ void MaxPoolCooKernel(const Context& dev_ctx, ...@@ -134,7 +137,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES( PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGPUKernel", ([&] { x.non_zero_indices().dtype(), "MaxPoolCooGPUKernel", ([&] {
MaxPoolCooGPUKernel<T, data_t>(dev_ctx, MaxPoolCooGPUKernel<T, data_t>(dev_ctx,
...@@ -144,7 +148,8 @@ void MaxPoolCooKernel(const Context& dev_ctx, ...@@ -144,7 +148,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
dilations, dilations,
strides, strides,
out, out,
rulebook); rulebook,
counter);
})); }));
} }
......
...@@ -25,6 +25,7 @@ template <typename T, typename Context> ...@@ -25,6 +25,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx, void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out, const SparseCooTensor& out,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
...@@ -34,12 +35,13 @@ template <typename T, typename Context> ...@@ -34,12 +35,13 @@ template <typename T, typename Context>
SparseCooTensor MaxPoolCooGrad(const Context& dev_ctx, SparseCooTensor MaxPoolCooGrad(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out, const SparseCooTensor& out,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes) { const std::vector<int>& kernel_sizes) {
SparseCooTensor x_grad; SparseCooTensor x_grad;
MaxPoolCooGradKernel<T, Context>( MaxPoolCooGradKernel<T, Context>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad);
return x_grad; return x_grad;
} }
......
...@@ -29,7 +29,8 @@ void MaxPoolCooKernel(const Context& dev_ctx, ...@@ -29,7 +29,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook); DenseTensor* rulebook,
DenseTensor* counter);
template <typename T, typename Context> template <typename T, typename Context>
SparseCooTensor MaxPoolCoo(const Context& dev_ctx, SparseCooTensor MaxPoolCoo(const Context& dev_ctx,
...@@ -38,10 +39,18 @@ SparseCooTensor MaxPoolCoo(const Context& dev_ctx, ...@@ -38,10 +39,18 @@ SparseCooTensor MaxPoolCoo(const Context& dev_ctx,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
DenseTensor* rulebook) { DenseTensor* rulebook,
DenseTensor* counter) {
SparseCooTensor coo; SparseCooTensor coo;
MaxPoolCooKernel<T, Context>( MaxPoolCooKernel<T, Context>(dev_ctx,
dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); x,
kernel_sizes,
paddings,
dilations,
strides,
&coo,
rulebook,
counter);
return coo; return coo;
} }
......
...@@ -76,8 +76,8 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -76,8 +76,8 @@ void TestConv3dBase(const std::vector<int>& indices,
kernel.size() * sizeof(T)); kernel.size() * sizeof(T));
if (!std::is_same<T, phi::dtype::float16>::value) { if (!std::is_same<T, phi::dtype::float16>::value) {
auto tensor_out = paddle::experimental::sparse::conv3d( auto tensor_out = paddle::experimental::sparse::conv3d_coo(
x, weight, paddings, dilations, strides, 1, false); x, weight, paddings, dilations, strides, 1, false, "Conv3d");
auto out = auto out =
std::dynamic_pointer_cast<phi::SparseCooTensor>(tensor_out.impl()); std::dynamic_pointer_cast<phi::SparseCooTensor>(tensor_out.impl());
......
...@@ -112,8 +112,7 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -112,8 +112,7 @@ void TestConv3dBase(const std::vector<IntT>& indices,
}; };
if (!std::is_same<T, phi::dtype::float16>::value) { if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty( DenseTensor rulebook, counter;
dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor out = sparse::Conv3dCoo<T>(dev_ctx_cpu, SparseCooTensor out = sparse::Conv3dCoo<T>(dev_ctx_cpu,
x_tensor, x_tensor,
kernel_tensor, kernel_tensor,
...@@ -122,7 +121,9 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -122,7 +121,9 @@ void TestConv3dBase(const std::vector<IntT>& indices,
strides, strides,
1, 1,
subm, subm,
&rulebook); "Conv3d",
&rulebook,
&counter);
ASSERT_EQ(correct_out_dims.size(), out.dims().size()); ASSERT_EQ(correct_out_dims.size(), out.dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) { for (int i = 0; i < correct_out_dims.size(); i++) {
...@@ -142,13 +143,16 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -142,13 +143,16 @@ void TestConv3dBase(const std::vector<IntT>& indices,
sparse::Conv3dCooGrad<T>(dev_ctx_cpu, sparse::Conv3dCooGrad<T>(dev_ctx_cpu,
x_tensor, x_tensor,
kernel_tensor, kernel_tensor,
out,
rulebook, rulebook,
counter,
out, out,
paddings, paddings,
dilations, dilations,
strides, strides,
1, 1,
subm); subm,
"Conv3d");
f_verify(std::get<0>(grads).non_zero_elements().data<T>(), features_grad); f_verify(std::get<0>(grads).non_zero_elements().data<T>(), features_grad);
f_verify(std::get<1>(grads).data<T>(), kernel_grad); f_verify(std::get<1>(grads).data<T>(), kernel_grad);
} }
...@@ -196,8 +200,7 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -196,8 +200,7 @@ void TestConv3dBase(const std::vector<IntT>& indices,
phi::Copy( phi::Copy(
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty( DenseTensor d_rulebook, d_counter;
dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor d_out = sparse::Conv3dCoo<T>(dev_ctx_gpu, SparseCooTensor d_out = sparse::Conv3dCoo<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
d_kernel_tensor, d_kernel_tensor,
...@@ -206,8 +209,9 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -206,8 +209,9 @@ void TestConv3dBase(const std::vector<IntT>& indices,
strides, strides,
1, 1,
subm, subm,
&d_rulebook); "Conv3d",
&d_rulebook,
&d_counter);
SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out); SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
...@@ -245,13 +249,16 @@ void TestConv3dBase(const std::vector<IntT>& indices, ...@@ -245,13 +249,16 @@ void TestConv3dBase(const std::vector<IntT>& indices,
sparse::Conv3dCooGrad<T>(dev_ctx_gpu, sparse::Conv3dCooGrad<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
d_kernel_tensor, d_kernel_tensor,
d_out,
d_rulebook, d_rulebook,
d_counter,
d_out, d_out,
paddings, paddings,
dilations, dilations,
strides, strides,
1, 1,
subm); subm,
"Conv3d");
DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements();
DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor d_kernel_grad = std::get<1>(grads);
DenseTensor h_features_grad = DenseTensor h_features_grad =
......
...@@ -90,14 +90,15 @@ void TestMaxPoolBase(const std::vector<IntT>& indices, ...@@ -90,14 +90,15 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
}; };
if (!std::is_same<T, phi::dtype::float16>::value) { if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook; DenseTensor rulebook, counter;
SparseCooTensor out = sparse::MaxPoolCoo<T>(dev_ctx_cpu, SparseCooTensor out = sparse::MaxPoolCoo<T>(dev_ctx_cpu,
x_tensor, x_tensor,
kernel_sizes, kernel_sizes,
paddings, paddings,
dilations, dilations,
strides, strides,
&rulebook); &rulebook,
&counter);
ASSERT_EQ(correct_out_dims.size(), out.dims().size()); ASSERT_EQ(correct_out_dims.size(), out.dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) { for (int i = 0; i < correct_out_dims.size(); i++) {
...@@ -114,7 +115,7 @@ void TestMaxPoolBase(const std::vector<IntT>& indices, ...@@ -114,7 +115,7 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
if (backward) { if (backward) {
SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>( SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>(
dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes); dev_ctx_cpu, x_tensor, rulebook, counter, out, out, kernel_sizes);
f_verify(x_grad.non_zero_elements().data<T>(), features_grad); f_verify(x_grad.non_zero_elements().data<T>(), features_grad);
} }
} }
...@@ -150,14 +151,16 @@ void TestMaxPoolBase(const std::vector<IntT>& indices, ...@@ -150,14 +151,16 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_rulebook; DenseTensor d_rulebook, d_counter;
SparseCooTensor d_out = sparse::MaxPoolCoo<T>(dev_ctx_gpu, SparseCooTensor d_out = sparse::MaxPoolCoo<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
kernel_sizes, kernel_sizes,
paddings, paddings,
dilations, dilations,
strides, strides,
&d_rulebook); &d_rulebook,
&d_counter);
SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out); SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
...@@ -191,8 +194,13 @@ void TestMaxPoolBase(const std::vector<IntT>& indices, ...@@ -191,8 +194,13 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features); f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) { if (backward) {
SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>( SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>(dev_ctx_gpu,
dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes); d_x_tensor,
d_rulebook,
d_counter,
d_out,
d_out,
kernel_sizes);
DenseTensor h_features_grad = DenseTensor h_features_grad =
phi::EmptyLike<T>(dev_ctx_cpu, x_grad.non_zero_elements()); phi::EmptyLike<T>(dev_ctx_cpu, x_grad.non_zero_elements());
phi::Copy(dev_ctx_gpu, phi::Copy(dev_ctx_gpu,
......
...@@ -67,7 +67,7 @@ class TestSparseConv(unittest.TestCase): ...@@ -67,7 +67,7 @@ class TestSparseConv(unittest.TestCase):
indices, values, dense_shape, stop_gradient=True) indices, values, dense_shape, stop_gradient=True)
weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')
y = paddle.incubate.sparse.nn.functional.subm_conv3d( y = paddle.incubate.sparse.nn.functional.subm_conv3d(
sparse_x, weight) sparse_x, weight, key='subm_conv')
assert np.array_equal(sparse_x.indices().numpy(), assert np.array_equal(sparse_x.indices().numpy(),
y.indices().numpy()) y.indices().numpy())
...@@ -91,7 +91,7 @@ class TestSparseConv(unittest.TestCase): ...@@ -91,7 +91,7 @@ class TestSparseConv(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
#Currently, only support data_format='NDHWC' #Currently, only support data_format='NDHWC'
conv3d = paddle.incubate.sparse.nn.SubmConv3D( conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NCDHW') 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv')
def test_SubmConv3D(self): def test_SubmConv3D(self):
with _test_eager_guard(): with _test_eager_guard():
...@@ -105,7 +105,7 @@ class TestSparseConv(unittest.TestCase): ...@@ -105,7 +105,7 @@ class TestSparseConv(unittest.TestCase):
indices, values, dense_shape, False) indices, values, dense_shape, False)
subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D( subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NDHWC') 1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv')
# test extra_repr # test extra_repr
print(subm_conv3d.extra_repr()) print(subm_conv3d.extra_repr())
...@@ -117,7 +117,7 @@ class TestSparseConv(unittest.TestCase): ...@@ -117,7 +117,7 @@ class TestSparseConv(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
#Currently, only support data_format='NDHWC' #Currently, only support data_format='NDHWC'
conv3d = paddle.incubate.sparse.nn.SubmConv3D( conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NCDHW') 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv')
def test_Conv3D_bias(self): def test_Conv3D_bias(self):
with _test_eager_guard(): with _test_eager_guard():
......
...@@ -29,6 +29,7 @@ def _conv3d(x, ...@@ -29,6 +29,7 @@ def _conv3d(x,
dilation=1, dilation=1,
groups=1, groups=1,
subm=False, subm=False,
key=None,
data_format="NDHWC", data_format="NDHWC",
name=None): name=None):
assert in_dynamic_mode(), "Currently, only support dynamic mode" assert in_dynamic_mode(), "Currently, only support dynamic mode"
...@@ -62,8 +63,9 @@ def _conv3d(x, ...@@ -62,8 +63,9 @@ def _conv3d(x,
dilation = convert_to_list(dilation, dims, 'dilation') dilation = convert_to_list(dilation, dims, 'dilation')
op_type = "conv3d" op_type = "conv3d"
pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, pre_bias = _C_ops.final_state_sparse_conv3d_coo(
stride, groups, subm) x, weight, padding, dilation, stride, groups, subm,
key if key is not None else "")
if bias is not None: if bias is not None:
values = pre_bias.values() values = pre_bias.values()
add_bias = elementwise_add(values, bias, axis=1) add_bias = elementwise_add(values, bias, axis=1)
...@@ -186,7 +188,7 @@ def conv3d(x, ...@@ -186,7 +188,7 @@ def conv3d(x,
# (1, 1, 1, 2, 1) # (1, 1, 1, 2, 1)
""" """
return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, return _conv3d(x, weight, bias, stride, padding, dilation, groups, False,
data_format, name) None, data_format, name)
def subm_conv3d(x, def subm_conv3d(x,
...@@ -197,6 +199,7 @@ def subm_conv3d(x, ...@@ -197,6 +199,7 @@ def subm_conv3d(x,
dilation=1, dilation=1,
groups=1, groups=1,
data_format="NDHWC", data_format="NDHWC",
key=None,
name=None): name=None):
r""" r"""
...@@ -274,6 +277,10 @@ def subm_conv3d(x, ...@@ -274,6 +277,10 @@ def subm_conv3d(x,
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of:
`[batch_size, input_depth, input_height, input_width, input_channels]`. `[batch_size, input_depth, input_height, input_width, input_channels]`.
key(str, optional): the key is used to save the rulebook or reuse a previously
saved one. The definition and role of the rulebook are described in
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The
default value is None.
name(str|None): For detailed information, please refer name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and to :ref:`api_guide_Name`. Usually name is no need to set and
None by default. None by default.
...@@ -301,4 +308,4 @@ def subm_conv3d(x, ...@@ -301,4 +308,4 @@ def subm_conv3d(x,
#(1, 1, 3, 4, 1) #(1, 1, 3, 4, 1)
""" """
return _conv3d(x, weight, bias, stride, padding, dilation, groups, True, return _conv3d(x, weight, bias, stride, padding, dilation, groups, True,
data_format, name) key, data_format, name)
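A hedged usage sketch of the new key argument on the functional API, modeled on the unit test earlier in this diff; the indices and values are illustrative only. Passing a key caches the rulebook built by the submanifold convolution so later calls with the same key can reuse it.

import paddle

# A tiny 5-D sparse input in NDHWC layout; indices/values are illustrative.
indices = paddle.to_tensor(
    [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]], dtype='int32')
values = paddle.to_tensor([[1.0], [2.0], [3.0], [4.0]], dtype='float32')
sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
    indices, values, [1, 1, 3, 4, 1])

weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')

# Submanifold conv keeps the input sparsity pattern; the rulebook built here
# is saved under 'subm_conv' and reused by later calls with the same key.
y = paddle.incubate.sparse.nn.functional.subm_conv3d(
    sparse_x, weight, key='subm_conv')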
...@@ -33,6 +33,7 @@ class _Conv3D(Layer): ...@@ -33,6 +33,7 @@ class _Conv3D(Layer):
dilation=1, dilation=1,
groups=1, groups=1,
subm=False, subm=False,
key=None,
padding_mode='zeros', padding_mode='zeros',
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
...@@ -46,6 +47,7 @@ class _Conv3D(Layer): ...@@ -46,6 +47,7 @@ class _Conv3D(Layer):
self._out_channels = out_channels self._out_channels = out_channels
self._data_format = data_format self._data_format = data_format
self._subm = subm self._subm = subm
self._key = key
assert padding_mode == 'zeros', "Currently, only support padding_mode='zeros'" assert padding_mode == 'zeros', "Currently, only support padding_mode='zeros'"
assert groups == 1, "Currently, only support groups=1" assert groups == 1, "Currently, only support groups=1"
...@@ -95,6 +97,7 @@ class _Conv3D(Layer): ...@@ -95,6 +97,7 @@ class _Conv3D(Layer):
dilation=self._dilation, dilation=self._dilation,
groups=self._groups, groups=self._groups,
subm=self._subm, subm=self._subm,
key=self._key,
data_format=self._data_format) data_format=self._data_format)
return out return out
...@@ -240,6 +243,7 @@ class Conv3D(_Conv3D): ...@@ -240,6 +243,7 @@ class Conv3D(_Conv3D):
dilation=dilation, dilation=dilation,
groups=groups, groups=groups,
subm=False, subm=False,
key=None,
padding_mode=padding_mode, padding_mode=padding_mode,
weight_attr=weight_attr, weight_attr=weight_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
...@@ -293,6 +297,10 @@ class SubmConv3D(_Conv3D): ...@@ -293,6 +297,10 @@ class SubmConv3D(_Conv3D):
of the input channels, while the second half of the filters is only of the input channels, while the second half of the filters is only
connected to the second half of the input channels. The default value is 1. connected to the second half of the input channels. The default value is 1.
padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``. padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``.
key(str, optional): the key is used to save the rulebook or reuse a previously
saved one. The definition and role of the rulebook are described in
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The
default value is None.
weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d. If it is set to None or one attribute of ParamAttr, conv3d of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter will create ParamAttr as param_attr. If it is set to None, the parameter
...@@ -361,6 +369,7 @@ class SubmConv3D(_Conv3D): ...@@ -361,6 +369,7 @@ class SubmConv3D(_Conv3D):
dilation=1, dilation=1,
groups=1, groups=1,
padding_mode='zeros', padding_mode='zeros',
key=None,
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
data_format="NDHWC"): data_format="NDHWC"):
...@@ -372,6 +381,7 @@ class SubmConv3D(_Conv3D): ...@@ -372,6 +381,7 @@ class SubmConv3D(_Conv3D):
dilation=dilation, dilation=dilation,
groups=groups, groups=groups,
subm=True, subm=True,
key=key,
padding_mode=padding_mode, padding_mode=padding_mode,
weight_attr=weight_attr, weight_attr=weight_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
......
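The layer API takes the same key at construction time and forwards it on every call; a hedged sketch mirroring the SubmConv3D unit test above (indices and values are illustrative):

import paddle

indices = paddle.to_tensor(
    [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]], dtype='int32')
values = paddle.to_tensor([[1.0], [2.0], [3.0], [4.0]], dtype='float32')
sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
    indices, values, [1, 1, 3, 4, 1])

# in_channels=1, out_channels=1, kernel_size=(1, 3, 3); the rulebook from the
# first forward pass is cached under 'subm_conv' and reused on later passes.
subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D(
    1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv')
y = subm_conv3d(sparse_x)
# y keeps the sparsity pattern of sparse_x: y.indices() match sparse_x.indices()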