Unverified commit 9841b308 authored by zhangkaihuo, committed by GitHub

Optimize sparse convolution (#43576)

Parent 22342d51
......@@ -80,14 +80,14 @@
data_type : x
backward : cast_grad
- api : conv3d
args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
output : Tensor(out), Tensor(rulebook)
- api : conv3d_coo
args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
output : Tensor(out), Tensor(rulebook), Tensor(counter)
kernel :
func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense}
func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense, dense}
layout : x
intermediate : rulebook
backward : conv3d_grad
intermediate: rulebook, counter
backward : conv3d_coo_grad
- api : coo_to_dense
args : (Tensor x)
......@@ -352,11 +352,11 @@
- api: maxpool
args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides)
output : Tensor(out), Tensor(rulebook)
output : Tensor(out), Tensor(rulebook), Tensor(counter)
kernel :
func : maxpool_coo{sparse_coo -> sparse_coo, dense}
func : maxpool_coo{sparse_coo -> sparse_coo, dense, dense}
layout : x
intermediate : rulebook
intermediate : rulebook, counter
backward : maxpool_grad
- api: mv
......
......@@ -81,12 +81,12 @@
cast_csr_grad {sparse_csr, sparse_csr -> sparse_csr}
data_type : out_grad
- backward_api : conv3d_grad
forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
- backward_api : conv3d_coo_grad
forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter)
args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
output : Tensor(x_grad), Tensor(kernel_grad)
kernel :
func : conv3d_coo_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
func : conv3d_coo_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
- backward_api : coo_to_dense_grad
forward : coo_to_dense(Tensor x) -> Tensor(out)
......@@ -164,11 +164,11 @@
matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}
- backward_api : maxpool_grad
forward : maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook)
args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes)
forward : maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter)
args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes)
output : Tensor(x_grad)
kernel :
func : maxpool_coo_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo}
func : maxpool_coo_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo}
- backward_api : multiply_grad
forward : multiply(Tensor x, Tensor y) -> Tensor(out)
......
......@@ -156,6 +156,48 @@ class SparseCooTensor : public TensorBase,
/// \brief get the dense dim
int32_t dense_dim() const;
/// \brief query table according to key
const std::pair<DenseTensor, DenseTensor>* IndicesPairs(
const std::string& key) const {
if (indices_dict_ == nullptr) {
return nullptr;
}
const auto& iter = indices_dict_->find(key);
if (iter == indices_dict_->end()) {
return nullptr;
}
return &iter->second;
}
/// \brief save (key, indices_pairs)
void SaveIndicesPairs(
const std::string& key,
const std::pair<DenseTensor, DenseTensor>& indices_pairs) {
if (indices_dict_ == nullptr) {
indices_dict_ = std::make_shared<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>();
}
auto ret = indices_dict_->insert({key, indices_pairs});
if (ret.second == false) {
ret.first->second = indices_pairs;
}
}
/// \brief get indices_dict_
const std::shared_ptr<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>&
GetIndicesDict() const {
return indices_dict_;
}
/// \brief set indices_dict_
void SetIndicesDict(
const std::shared_ptr<
std::map<std::string, std::pair<DenseTensor, DenseTensor>>>&
indices_dict) {
indices_dict_ = indices_dict;
}
private:
// save the indices of non zero elements in original dense tensor
DenseTensor non_zero_indices_;
......@@ -165,6 +207,14 @@ class SparseCooTensor : public TensorBase,
bool coalesced_ = false;
// save the number of non zero elements in each batch
DDim dims_;
// for submanifold conv
// SubmConv will generate a rulebook and a counter, which can be
// reused by other SubmConv ops that share the same key.
// refer to sparse/gpu/convolution_kernel.cu.
std::shared_ptr<std::map<std::string, std::pair<DenseTensor, DenseTensor>>>
indices_dict_ = nullptr;
/* --------------------------- */
/* example: non zero element is scalar */
/* --------------------------- */
......
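The two accessors above form a small per-key cache: the first SubmConv carrying a given key runs ProductRuleBook once and stores the resulting (rulebook, counter) pair on its output tensor; a later SubmConv that consumes that tensor and uses the same key fetches the pair through IndicesPairs and skips the rebuild. A minimal, self-contained sketch of that lookup-or-build pattern is given below; plain std::vector pairs stand in for the two cached DenseTensors, and LookUp/Save are illustrative names, not Paddle APIs.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Stand-ins for the two DenseTensors cached per key (rulebook, counter).
using IndicesPair = std::pair<std::vector<int>, std::vector<int>>;
using IndicesDict = std::map<std::string, IndicesPair>;

// Mirrors SparseCooTensor::IndicesPairs: return the cached pair or nullptr.
const IndicesPair* LookUp(const std::shared_ptr<IndicesDict>& dict,
                          const std::string& key) {
  if (dict == nullptr) return nullptr;
  auto iter = dict->find(key);
  return iter == dict->end() ? nullptr : &iter->second;
}

// Mirrors SparseCooTensor::SaveIndicesPairs: insert or overwrite under key.
void Save(std::shared_ptr<IndicesDict>* dict,
          const std::string& key,
          IndicesPair pair) {
  if (*dict == nullptr) *dict = std::make_shared<IndicesDict>();
  (**dict)[key] = std::move(pair);
}

int main() {
  std::shared_ptr<IndicesDict> dict;  // empty: the first SubmConv must build
  if (LookUp(dict, "subm_conv") == nullptr) {
    // ... ProductRuleBook would run here; cache its result under the key.
    Save(&dict, "subm_conv", {{0, 1, 2}, {3}});
  }
  // A later SubmConv sharing the key reuses the cached pair directly.
  const IndicesPair* cached = LookUp(dict, "subm_conv");
  std::cout << "cached rulebook entries: " << cached->first.size() << std::endl;
  return 0;
}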
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
......@@ -188,6 +189,88 @@ inline void PrefixSum(const T* counter, T* offsets, const int n) {
offsets[n] = offset;
}
template <typename IntT>
inline const IntT* GetRulebookPtr(const SparseCooTensor& coo,
const DenseTensor& rulebook,
const std::string& key,
int* rulebook_len) {
if (!key.empty()) {
const auto* indices_pairs = coo.IndicesPairs(key);
if (indices_pairs != nullptr) {
const DenseTensor& tmp_rulebook = indices_pairs->first;
*rulebook_len = tmp_rulebook.dims()[1];
return tmp_rulebook.data<IntT>();
}
}
*rulebook_len = rulebook.dims()[1];
return rulebook.data<IntT>();
}
inline const int* GetCounterPtr(const SparseCooTensor& coo,
const DenseTensor& counter,
const std::string& key) {
if (!key.empty()) {
const auto* indices_pairs = coo.IndicesPairs(key);
if (indices_pairs != nullptr) {
return indices_pairs->second.data<int>();
}
}
return counter.data<int>();
}
template <typename T, typename IntT, typename Context>
inline const IntT* PrepareSubm(const Context& dev_ctx,
const SparseCooTensor& x,
const std::string& key,
const DDim& out_dims,
SparseCooTensor* out,
int* counter,
int* offsets,
int* rulebook_len,
bool* need_product_rulebook) {
const auto* indices_pairs = x.IndicesPairs(key);
if (indices_pairs != nullptr) {
*need_product_rulebook = false;
const DenseTensor& rulebook = indices_pairs->first;
const int counter_size = indices_pairs->second.numel();
memcpy(
counter, indices_pairs->second.data<int>(), counter_size * sizeof(int));
out->SetIndicesDict(x.GetIndicesDict());
*rulebook_len = rulebook.dims()[1];
DenseTensor out_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor out_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
phi::Copy(
dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices);
out->SetMember(out_indices, out_values, out_dims, false);
PrefixSum<int>(counter, offsets, counter_size);
return rulebook.data<IntT>();
}
return nullptr;
}
template <typename Context>
inline void SaveToTable(const Context& dev_ctx,
const SparseCooTensor& x,
const std::string& key,
const DenseTensor& in_rulebook,
const DenseTensor& h_counter,
SparseCooTensor* out,
DenseTensor* out_rulebook,
DenseTensor* counter) {
out->SetIndicesDict(x.GetIndicesDict());
if (!key.empty()) {
out->SaveIndicesPairs(key, std::make_pair(in_rulebook, h_counter));
} else {
*out_rulebook = in_rulebook;
counter->Resize({h_counter.numel()});
int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
memcpy(counter_ptr, h_counter.data<int>(), h_counter.numel() * sizeof(int));
}
}
} // namespace sparse
} // namespace funcs
} // namespace phi
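PrepareSubm above and the conv/pool kernels later in this diff share the same bookkeeping: the per-kernel-offset counter is converted to exclusive prefix sums, and the GEMM for kernel offset i then consumes the gathered feature rows in [offsets[i], offsets[i] + counter[i]). The standalone snippet below illustrates that bookkeeping; PrefixSum here is a local stand-in written to match how the helper is used in this diff, not the Paddle function itself.

#include <cstdio>
#include <vector>

// Exclusive prefix sum with the total appended at offsets[n], matching the
// way phi::funcs::sparse::PrefixSum is consumed in the kernels above.
void PrefixSum(const int* counter, int* offsets, int n) {
  int offset = 0;
  for (int i = 0; i < n; ++i) {
    offsets[i] = offset;
    offset += counter[i];
  }
  offsets[n] = offset;
}

int main() {
  // counter[i]: how many rulebook entries use kernel offset i (3x1x1 kernel).
  std::vector<int> counter = {2, 5, 3};
  std::vector<int> offsets(counter.size() + 1);
  PrefixSum(counter.data(), offsets.data(), static_cast<int>(counter.size()));
  for (size_t i = 0; i < counter.size(); ++i) {
    // GEMM i multiplies the gathered feature rows in this half-open range.
    std::printf("kernel offset %zu -> rows [%d, %d)\n",
                i, offsets[i], offsets[i] + counter[i]);
  }
  // offsets.back() == total number of rulebook entries (10 here).
  return 0;
}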
......@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#define VecBytes 16
namespace phi {
namespace funcs {
......@@ -28,33 +33,126 @@ namespace sparse {
* channels: the output channel size
* out: the outputs
**/
template <typename T>
template <typename T, int VecSize>
__global__ void ScatterKernel(const T* input,
const int* unique_value,
const int* out_index,
const int non_zero_num,
const int rulebook_len,
const int channels,
T* out,
const bool subm = false) {
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
int indices_i = i / channels;
int channels_i = i - indices_i * channels;
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
for (int i = tid; i < non_zero_num * vec_channels;
i += gridDim.x * blockDim.x) {
int indices_i = i / vec_channels;
int channels_i = i - indices_i * vec_channels;
int start = unique_value[indices_i];
int end = indices_i == non_zero_num - 1 ? rulebook_len
: unique_value[indices_i + 1];
// max(end-start) = kernel_size
T sum = static_cast<T>(0);
if (subm) {
sum = out[indices_i * channels + channels_i];
}
StoreT sums = {static_cast<T>(0)};
for (int j = start; j < end; j++) {
const int out_feature_i = out_index[j];
sum += input[out_feature_i * channels + channels_i];
LoadT vec_in;
phi::Load<T, VecSize>(
input + out_feature_i * channels + channels_i * VecSize, &vec_in);
#pragma unroll
for (int k = 0; k < VecSize; k++) {
sums[k] += vec_in[k];
}
}
out[indices_i * channels + channels_i] = sum;
phi::Store<T, VecSize>(sums,
out + indices_i * channels + channels_i * VecSize);
}
}
// the scatter indices have been grouped in advance:
// index_counts records how many indices fall into each group
// index_groups stores the indices belonging to each group
template <typename T, int VecSize>
__global__ void ScatterKernelV2(const T* input,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_counts,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
for (int i = tid; i < non_zero_num * vec_channels;
i += gridDim.x * blockDim.x) {
int indices_i = i / vec_channels;
int channels_i = i - indices_i * vec_channels;
StoreT sums = {static_cast<T>(0)};
phi::Load<T, VecSize>(out + indices_i * channels + channels_i * VecSize,
&sums);
for (int it = 0; it < buffer_counts; it++) {
int len = index_counts[indices_i + it * non_zero_num];
const int group_offset = it * kernel_size * non_zero_num;
for (int j = 0; j < len; j++) {
const int out_feature_i =
index_groups[indices_i * kernel_size + j + group_offset];
LoadT vec_in;
phi::Load<T, VecSize>(
input + out_feature_i * channels + channels_i * VecSize, &vec_in);
#pragma unroll
for (int k = 0; k < VecSize; k++) {
sums[k] += vec_in[k];
}
}
}
phi::Store<T, VecSize>(sums,
out + indices_i * channels + channels_i * VecSize);
}
}
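ScatterKernelV2 assumes the scatter has already been turned into a per-output-row reduction: for every output non-zero, the positions of the gathered feature rows that fold into it are collected up front (in this diff that grouping is produced on the device by GroupIndexs / GroupIndexsV2). The plain CPU sketch below shows how such index_counts / index_groups buffers could be built and consumed for the buffer_counts == 1 case; every number is invented for illustration. Because the groups are precomputed, each output row is accumulated in registers and stored once, which is what lets the kernel avoid atomic adds.

#include <cstdio>
#include <vector>

int main() {
  // out_index[j]: the output row produced by rulebook entry j (made up).
  std::vector<int> out_index = {0, 1, 0, 2, 1, 0};
  const int non_zero_num = 3;  // number of output rows
  const int kernel_size = 3;   // upper bound on entries per output row

  // index_counts[r]: how many entries fold into output row r.
  // index_groups[r * kernel_size + k]: the k-th such entry.
  std::vector<int> index_counts(non_zero_num, 0);
  std::vector<int> index_groups(non_zero_num * kernel_size, -1);
  for (int j = 0; j < static_cast<int>(out_index.size()); ++j) {
    const int r = out_index[j];
    index_groups[r * kernel_size + index_counts[r]] = j;
    ++index_counts[r];
  }

  // The scatter then becomes a per-row reduction over each group.
  std::vector<float> input = {1, 2, 3, 4, 5, 6};  // one value per entry
  for (int r = 0; r < non_zero_num; ++r) {
    float sum = 0.f;
    for (int k = 0; k < index_counts[r]; ++k) {
      sum += input[index_groups[r * kernel_size + k]];
    }
    std::printf("output row %d <- %d entries, sum = %g\n",
                r, index_counts[r], sum);
  }
  return 0;
}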
template <typename T>
void ScatterV2(const GPUContext& dev_ctx,
const T* input,
const int* index_counts,
const int* index_groups,
const int non_zero_num,
const int kernel_size,
const int channels,
const int buffer_counts,
T* output) {
const int VecSize = VecBytes / sizeof(T);
if (channels % VecSize == 0) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels / VecSize, 1);
ScatterKernelV2<T, VecSize><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(input,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_counts,
output);
} else {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, non_zero_num * channels, 1);
ScatterKernelV2<T, 1><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(input,
index_counts,
index_groups,
non_zero_num,
kernel_size,
channels,
buffer_counts,
output);
}
}
......
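ScatterV2 above (and the matching Gather/Scatter call sites elsewhere in this diff) chooses the vector width as VecBytes / sizeof(T), i.e. 16-byte loads and stores through phi::AlignedVector, and falls back to scalar access whenever channels is not a multiple of that width. The launch-time choice is shown in isolation below as ordinary host code; the numbers are only an illustration of the arithmetic.

#include <cstdio>

constexpr int kVecBytes = 16;  // same 16-byte target as VecBytes in this diff

// Vector width the wrapper would pick for a given element size and channel
// count: full 16-byte vectors when channels divides evenly, otherwise 1.
int ChooseVecSize(int elem_bytes, int channels) {
  const int vec_size = kVecBytes / elem_bytes;
  return (channels % vec_size == 0) ? vec_size : 1;
}

int main() {
  std::printf("float,  channels=64 -> VecSize=%d\n", ChooseVecSize(4, 64));  // 4
  std::printf("double, channels=64 -> VecSize=%d\n", ChooseVecSize(8, 64));  // 2
  std::printf("half,   channels=64 -> VecSize=%d\n", ChooseVecSize(2, 64));  // 8
  std::printf("float,  channels=3  -> VecSize=%d\n", ChooseVecSize(4, 3));   // 1
  return 0;
}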
......@@ -25,13 +25,16 @@ template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad);
......@@ -40,13 +43,16 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dCooGrad(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm) {
const bool subm,
const std::string& key) {
SparseCooTensor x_grad;
DenseTensor kernel_grad;
......@@ -54,13 +60,16 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dCooGrad(
Conv3dCooGradKernel<T, Context>(dev_ctx,
x,
kernel,
out,
rulebook,
counter,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
key,
&x_grad,
&kernel_grad);
return std::make_tuple(x_grad, kernel_grad);
......
......@@ -31,8 +31,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
DenseTensor* rulebook);
DenseTensor* rulebook,
DenseTensor* counter);
template <typename T, typename Context>
SparseCooTensor Conv3dCoo(const Context& dev_ctx,
......@@ -43,7 +45,9 @@ SparseCooTensor Conv3dCoo(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
DenseTensor* rulebook) {
const std::string& key,
DenseTensor* rulebook,
DenseTensor* counter) {
SparseCooTensor coo;
Conv3dCooKernel<T, Context>(dev_ctx,
x,
......@@ -53,8 +57,10 @@ SparseCooTensor Conv3dCoo(const Context& dev_ctx,
strides,
groups,
subm,
key,
&coo,
rulebook);
rulebook,
counter);
return coo;
}
......
......@@ -41,13 +41,12 @@ void ProductRuleBook(const Context& dev_ctx,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel) {
int* counter_per_kernel) {
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const IntT* indices_ptr = non_zero_indices.data<IntT>();
int* counter_ptr = counter_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
memset(counter_ptr, 0, kernel_size * sizeof(int));
memset(counter_per_kernel, 0, kernel_size * sizeof(int));
int rulebook_len = 0;
// calc the rulebook_len
......@@ -107,7 +106,7 @@ void ProductRuleBook(const Context& dev_ctx,
}
if (rulebook_ptr == nullptr) {
counter_ptr[kernel_index - 1] += 1;
counter_per_kernel[kernel_index - 1] += 1;
++rulebook_len;
} else {
rulebook_ptr[rulebook_index] = kernel_index - 1;
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
......@@ -34,22 +34,27 @@ template <typename T, typename IntT = int>
void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1];
int rulebook_len = 0;
const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr<IntT>(
out, rulebook, key, &rulebook_len);
const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key);
DenseTensorMeta in_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
......@@ -86,16 +91,14 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
&x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1;
}
IntT offset = 0, max_count = 0;
std::vector<IntT> offsets(kernel_size + 1);
IntT offset = 0;
int max_count = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
offset += counter[i];
offset += counter_ptr[i];
if (i < half_kernel_size) {
max_count = std::max(max_count, counter[i]);
max_count = std::max(max_count, counter_ptr[i]);
}
}
offsets[kernel_size] = offset;
......@@ -129,11 +132,11 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0 || (subm && i == half_kernel_size)) {
if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) {
continue;
}
const int M = counter[i];
const int M = counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
......@@ -171,7 +174,7 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx,
// 4. scatter
Scatter<T, IntT>(d_x_features_ptr,
rulebook.data<IntT>() + rulebook_len,
rulebook_ptr + rulebook_len,
rulebook_len,
in_channels,
x_grad_values_ptr);
......@@ -181,13 +184,16 @@ template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_VISIT_INTEGRAL_TYPES(
......@@ -195,13 +201,16 @@ void Conv3dCooGradKernel(const Context& dev_ctx,
Conv3dCooGradCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
out,
rulebook,
counter,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
key,
x_grad,
kernel_grad);
}));
......
......@@ -14,9 +14,10 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
......@@ -35,8 +36,10 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
......@@ -66,26 +69,50 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel);
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out);
int n = rulebook->dims()[1];
const int* counter_ptr = counter_per_kernel.data<int>();
DenseTensor h_counter, h_offsets;
h_counter.Resize({kernel_size});
h_offsets.Resize({kernel_size + 1});
int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);
// DenseTensor* rulebook = nullptr;
const IntT* rulebook_ptr = nullptr;
int n = 0;
bool need_product_rulebook = true;
if (subm && !key.empty()) {
rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, CPUContext>(
dev_ctx,
x,
key,
out_dims,
out,
h_counter_ptr,
h_offsets_ptr,
&n,
&need_product_rulebook);
}
if (need_product_rulebook) {
DenseTensor tmp_rulebook;
ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
&tmp_rulebook,
h_counter_ptr);
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out);
n = tmp_rulebook.dims()[1];
rulebook_ptr = tmp_rulebook.data<IntT>();
phi::funcs::sparse::SaveToTable(
dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}
// int n = rulebook->dims()[1];
// 2. gather
DenseTensorMeta in_features_meta(
......@@ -100,34 +127,33 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
T* out_features_ptr = out_features.data<T>();
Gather<T, IntT>(x.non_zero_elements().data<T>(),
rulebook->data<IntT>() + n,
rulebook_ptr + n,
n,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
std::vector<int> offsets(kernel_size + 1);
int offset = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
offset += counter_ptr[i];
h_offsets_ptr[i] = offset;
offset += h_counter_ptr[i];
}
offsets[kernel_size] = offset;
h_offsets_ptr[kernel_size] = offset;
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (counter_ptr[i] <= 0) {
if (h_counter_ptr[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = counter_ptr[i];
const int M = h_counter_ptr[i];
const int K = in_channels; // in_channels
const int N = out_channels; // out_channels
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
......@@ -143,11 +169,8 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx,
// 4. scatter
T* out_values_ptr = out->mutable_non_zero_elements()->data<T>();
memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels);
Scatter<T, IntT>(out_features_ptr,
rulebook->data<IntT>() + n * 2,
n,
out_channels,
out_values_ptr);
Scatter<T, IntT>(
out_features_ptr, rulebook_ptr + n * 2, n, out_channels, out_values_ptr);
}
template <typename T, typename Context>
......@@ -159,8 +182,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] {
Conv3dCooCPUKernel<T, data_t>(dev_ctx,
......@@ -171,8 +196,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
strides,
groups,
subm,
key,
out,
rulebook);
rulebook,
counter);
}));
}
......
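The CPU forward above reads the input indices at rulebook_ptr + n and the output indices at rulebook_ptr + n * 2, i.e. it treats the rulebook as a 3 x n array whose rows are (kernel offset, input row, output row). The toy program below runs the gather / per-offset GEMM / scatter pipeline under that layout; the weight is replaced by an identity so the arithmetic stays visible, and every number is made up purely for illustration.

#include <cstdio>
#include <vector>

int main() {
  // Toy rulebook with n = 4 entries, stored as a 3 x n row-major array the
  // way the CPU kernel above indexes it:
  //   row 0 = kernel offset, row 1 = input row, row 2 = output row.
  const int n = 4;
  std::vector<int> rulebook = {
      0, 0, 1, 1,   // kernel offsets (sorted, so counter = {2, 2})
      0, 1, 1, 2,   // gather from these input rows
      0, 0, 1, 1};  // scatter-add into these output rows
  const int channels = 2;
  std::vector<float> x = {1, 2, 3, 4, 5, 6};  // 3 input rows, 2 channels each
  std::vector<float> out(2 * channels, 0.f);  // 2 output rows

  // 2. gather: in_features[i] = x[rulebook[n + i]]
  std::vector<float> in_features(n * channels);
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < channels; ++c) {
      in_features[i * channels + c] = x[rulebook[n + i] * channels + c];
    }
  }

  // 3. "gemm": an identity weight per kernel offset keeps the example tiny;
  //    the real kernel multiplies each counter[i]-row block by kernel[i].
  const std::vector<float>& out_features = in_features;

  // 4. scatter: out[rulebook[2 * n + i]] += out_features[i]
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < channels; ++c) {
      out[rulebook[2 * n + i] * channels + c] +=
          out_features[i * channels + c];
    }
  }

  for (int r = 0; r < 2; ++r) {
    std::printf("out row %d: (%g, %g)\n",
                r, out[r * channels], out[r * channels + 1]);
  }
  return 0;
}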
......@@ -28,6 +28,7 @@ template <typename T, typename IntT = int>
void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
......@@ -36,11 +37,10 @@ void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
const int channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1];
const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1;
}
phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
std::vector<int> offsets(kernel_size + 1);
const int* counter_ptr = counter.data<int>();
phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>();
......@@ -60,7 +60,7 @@ void MaxPoolCooGradCPUKernel(const CPUContext& dev_ctx,
phi::funcs::MaxPoolGrad<T> grad_functor;
for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < counter[i]; j++) {
for (int j = 0; j < counter_ptr[i]; j++) {
IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
for (int c = 0; c < channels; c++) {
......@@ -78,6 +78,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
......@@ -85,7 +86,7 @@ void MaxPoolCooGradKernel(const Context& dev_ctx,
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGradCPUKernel", ([&] {
MaxPoolCooGradCPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad);
}));
}
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/conv.h"
namespace phi {
namespace sparse {
......@@ -37,7 +37,8 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes =
......@@ -47,9 +48,7 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
const int in_channels = real_kernel_sizes[3];
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
std::vector<int> counter_per_kernel(kernel_size, 0);
const T* in_features_ptr = x.non_zero_elements().data<T>();
// 1. product rule book
......@@ -62,14 +61,17 @@ void MaxPoolCooCPUKernel(const CPUContext& dev_ctx,
out_dims,
false,
rulebook,
&counter_per_kernel);
counter_per_kernel.data());
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out);
int rulebook_len = rulebook->dims()[1];
const IntT* rulebook_ptr = rulebook->data<IntT>();
const int* counter_ptr = counter_per_kernel.data<int>();
counter->Resize({kernel_size});
int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int));
std::vector<int> offsets(kernel_size + 1);
phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
......@@ -105,7 +107,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooCPUKernel", ([&] {
MaxPoolCooCPUKernel<T, data_t>(dev_ctx,
......@@ -115,7 +118,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
dilations,
strides,
out,
rulebook);
rulebook,
counter);
}));
}
......
......@@ -125,16 +125,35 @@ void CoalesceGPUKernel(const GPUContext& dev_ctx,
}
// 5. scatter the values
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1);
phi::funcs::sparse::ScatterKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
x_values_ptr,
public_indexs.data<int>(),
values_indexs_ptr,
out_nnz,
nnz,
stride,
out_values.data<T>());
const int VecSize = VecBytes / sizeof(T);
if (stride % VecSize == 0) {
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, nnz * stride / VecSize, 1);
phi::funcs::sparse::ScatterKernel<T, VecSize>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(x_values_ptr,
public_indexs.data<int>(),
values_indexs_ptr,
out_nnz,
nnz,
stride,
out_values.data<T>());
} else {
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1);
phi::funcs::sparse::ScatterKernel<T, 1>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(x_values_ptr,
public_indexs.data<int>(),
values_indexs_ptr,
out_nnz,
nnz,
stride,
out_values.data<T>());
}
// 6. convert index to coordinate
Dim<DDim::kMaxRank> const_dims;
......
This diff is collapsed.
......@@ -19,13 +19,11 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
namespace phi {
namespace sparse {
......@@ -42,43 +40,42 @@ template <typename T, typename IntT>
void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1];
int rulebook_len = 0;
const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr<IntT>(
out, rulebook, key, &rulebook_len);
const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key);
DenseTensorMeta in_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
DenseTensorMeta d_x_features_meta(
x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_grad_features_meta(
x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta));
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor d_x_features =
phi::Empty(dev_ctx, std::move(d_x_features_meta));
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_grad_features =
phi::Empty(dev_ctx, std::move(out_grad_features_meta));
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* d_x_features_ptr = d_x_features.data<T>();
T* out_grad_features_ptr = out_grad_features.data<T>();
*kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
T* d_kernel_ptr = kernel_grad->data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f));
phi::backends::gpu::GpuMemsetAsync(
d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream());
int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
......@@ -86,8 +83,12 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>();
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f));
set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f));
phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr,
0,
sizeof(T) * x_grad_values.numel(),
dev_ctx.stream());
phi::backends::gpu::GpuMemsetAsync(
d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream());
phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
......@@ -95,29 +96,14 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
&x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(rulebook_len, 0);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr,
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
std::vector<int> offsets(kernel_size + 1);
for (int i = 0; i < rulebook_len; i++) {
counter[h_counter[i]] += 1;
}
IntT offset = 0, max_count = 0;
int offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
offset += counter[i];
offset += counter_ptr[i];
if (i < half_kernel_size) {
max_count = std::max(max_count, counter[i]);
max_count = std::max(max_count, counter_ptr[i]);
}
}
offsets[kernel_size] = offset;
......@@ -138,36 +124,52 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
}
}
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1);
GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
in_features_ptr,
rulebook_len,
in_channels);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
DenseTensor unique_value = phi::Empty<int>(
dev_ctx, {static_cast<int>(x_grad->nnz() * kernel_size * 2)});
DenseTensor out_index =
phi::Empty<int>(dev_ctx, {static_cast<int>(x.nnz() * 2)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream());
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * out_channels, 1);
GatherKernel<T, IntT>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2,
out_grad_features_ptr,
rulebook_len,
out_channels);
GroupIndexsV2<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
x.nnz(),
kernel_size,
offsets[kernel_size / 2],
rulebook_ptr,
out_index_ptr,
unique_value_ptr);
GatherV2<T, IntT>(dev_ctx,
x.non_zero_elements().data<T>(),
out_index_ptr,
unique_value_ptr,
x.nnz(),
kernel_size,
in_channels,
2,
in_features_ptr);
Gather<T, IntT>(dev_ctx,
out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
rulebook_len,
out_channels,
out_grad_features_ptr);
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0 || (subm && i == half_kernel_size)) {
if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) {
continue;
}
const int M = counter[i];
const int M = counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
......@@ -204,32 +206,31 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx,
}
// 4. scatter
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1);
phi::funcs::ScatterCUDAKernel<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
d_x_features_ptr,
rulebook_ptr + rulebook_len,
x_grad_values_ptr,
rulebook_len,
in_channels,
false);
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
d_x_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
x_grad->nnz(),
kernel_size,
in_channels,
2,
x_grad_values_ptr);
}
template <typename T, typename Context>
void Conv3dCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const SparseCooTensor& out,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_VISIT_INTEGRAL_TYPES(
......@@ -237,13 +238,16 @@ void Conv3dCooGradKernel(const Context& dev_ctx,
Conv3dCooGradGPUKernel<T, data_t>(dev_ctx,
x,
kernel,
out,
rulebook,
counter,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
key,
x_grad,
kernel_grad);
}));
......
......@@ -21,7 +21,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#include "glog/logging.h"
namespace phi {
namespace sparse {
......@@ -35,8 +37,10 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
......@@ -61,85 +65,117 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
DenseTensor h_counter, h_offsets;
h_counter.Resize({kernel_size});
h_offsets.Resize({kernel_size + 1});
int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensorMeta offsets_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
int n = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_value,
out,
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>();
const int* offsets_ptr = counter_per_kernel.data<int>();
const IntT* rulebook_ptr = rulebook->data<IntT>();
DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});
VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
int rulebook_len = 0;
const IntT* rulebook_ptr = nullptr;
bool need_product_rulebook = true;
if (subm && !key.empty()) {
rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
dev_ctx,
x,
key,
out_dims,
out,
h_counter.data<int>(),
h_offsets.data<int>(),
&rulebook_len,
&need_product_rulebook);
}
if (need_product_rulebook) {
DenseTensor tmp_rulebook;
rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
&tmp_rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_value,
out,
h_counter_ptr,
h_offsets_ptr);
rulebook_ptr = tmp_rulebook.data<IntT>();
phi::funcs::sparse::SaveToTable(
dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}
// 2. gather
DenseTensorMeta in_features_meta(
x.dtype(), {n, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_features_meta(
x.dtype(), {n, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta));
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features =
phi::Empty(dev_ctx, std::move(out_features_meta));
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + n,
in_features_ptr,
n,
in_channels);
Gather<T, IntT>(dev_ctx,
x.non_zero_elements().data<T>(),
rulebook_ptr,
rulebook_len,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
}
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter[i] <= 0) {
if (h_counter_ptr[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter[i];
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
......@@ -154,40 +190,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
}
// 4. scatter
if (subm) {
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1);
phi::funcs::ScatterCUDAKernel<T, IntT>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_features_ptr,
rulebook_ptr + 2 * n,
out_values_ptr,
n,
out_channels,
false);
} else {
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, out->nnz() * out_channels, 1);
phi::funcs::sparse::ScatterKernel<T>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_features_ptr,
unique_value.data<int>(),
out_index.data<int>(),
out->nnz(),
n,
out_channels,
out_values_ptr);
}
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
out_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
out->nnz(),
kernel_size,
out_channels,
1,
out_values_ptr);
}
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
* x: the input SparseCooTensor, shape is (N, D, H, W, C)
* kernel: the weight data, shape is (D, H, W, C, OC)
* out: the output SparseCooTensor, shape is (N, D, H, W, OC)
* rulebook: return the rulebook if key is not valid, else return nullptr
* counter: return the counter if key is not valid, else return nullptr
**/
template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx,
......@@ -198,8 +217,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const int groups,
const bool subm,
const std::string& key,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] {
Conv3dCooGPUKernel<T, data_t>(dev_ctx,
......@@ -210,8 +231,10 @@ void Conv3dCooKernel(const Context& dev_ctx,
strides,
groups,
subm,
key,
out,
rulebook);
rulebook,
counter);
}));
}
......
......@@ -238,6 +238,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx,
x_indexs_ptr, x_indexs.numel(), table.data<int>());
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1);
const int VecBytes = 16;
const int VecSize = VecBytes / sizeof(T);
if (stride % VecSize == 0) {
......
......@@ -55,6 +55,7 @@ template <typename T, typename IntT = int>
void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
......@@ -63,23 +64,9 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
const int in_channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1];
const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(rulebook_len, 0);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr,
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
for (int i = 0; i < rulebook_len; i++) {
counter[h_counter[i]] += 1;
}
phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
std::vector<int> offsets(kernel_size + 1);
const int* counter_ptr = counter.data<int>();
phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>();
......@@ -99,12 +86,12 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
&x_grad_indices);
for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) {
if (counter_ptr[i] <= 0) {
continue;
}
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1);
dev_ctx, counter_ptr[i] * in_channels, 1);
MaxPoolGradCudaKernel<T, IntT>
<<<config.block_per_grid.x,
config.thread_per_block.x,
......@@ -112,8 +99,8 @@ void MaxPoolCooGradGPUKernel(const GPUContext& dev_ctx,
dev_ctx.stream()>>>(in_features_ptr,
out_features_ptr,
out_grad_ptr,
rulebook_ptr + offsets[i] + rulebook_len,
counter[i],
rulebook_ptr + offsets[i],
counter_ptr[i],
rulebook_len,
in_channels,
x_grad_ptr);
......@@ -124,6 +111,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
......@@ -131,7 +119,7 @@ void MaxPoolCooGradKernel(const Context& dev_ctx,
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGradGPUKernel", ([&] {
MaxPoolCooGradGPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad);
}));
}
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
namespace phi {
namespace sparse {
......@@ -55,7 +55,8 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes =
......@@ -65,7 +66,7 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
const int in_channels = real_kernel_sizes[3];
std::vector<int> offsets(kernel_size + 1), counter(kernel_size);
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
......@@ -89,13 +90,16 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
&out_index,
&unique_value,
out,
&counter,
&offsets);
h_counter.data(),
offsets.data());
const IntT* rulebook_ptr = rulebook->data<IntT>();
T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
const T* in_features_ptr = x.non_zero_elements().data<T>();
counter->Resize({kernel_size});
int* counter_ptr = dev_ctx.template HostAlloc<int>(counter);
memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int));
// 2. max pool
#ifdef PADDLE_WITH_HIP
thrust::fill(thrust::hip::par.on(dev_ctx.stream()),
......@@ -107,22 +111,21 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx,
static_cast<T>(0));
// TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster
for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) {
if (h_counter[i] <= 0) {
continue;
}
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1);
MaxPoolCudaKernel<T, IntT>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(in_features_ptr,
rulebook_ptr + offsets[i] + rulebook_len,
counter[i],
rulebook_len,
in_channels,
out_features_ptr);
dev_ctx, h_counter[i] * in_channels, 1);
MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(in_features_ptr,
rulebook_ptr + offsets[i],
h_counter[i],
rulebook_len,
in_channels,
out_features_ptr);
}
}
......@@ -134,7 +137,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCooGPUKernel", ([&] {
MaxPoolCooGPUKernel<T, data_t>(dev_ctx,
......@@ -144,7 +148,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
dilations,
strides,
out,
rulebook);
rulebook,
counter);
}));
}
......
......@@ -25,6 +25,7 @@ template <typename T, typename Context>
void MaxPoolCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
......@@ -34,12 +35,13 @@ template <typename T, typename Context>
SparseCooTensor MaxPoolCooGrad(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const DenseTensor& counter,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes) {
SparseCooTensor x_grad;
MaxPoolCooGradKernel<T, Context>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad);
dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad);
return x_grad;
}
......
......@@ -29,7 +29,8 @@ void MaxPoolCooKernel(const Context& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook);
DenseTensor* rulebook,
DenseTensor* counter);
template <typename T, typename Context>
SparseCooTensor MaxPoolCoo(const Context& dev_ctx,
......@@ -38,10 +39,18 @@ SparseCooTensor MaxPoolCoo(const Context& dev_ctx,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
DenseTensor* rulebook) {
DenseTensor* rulebook,
DenseTensor* counter) {
SparseCooTensor coo;
MaxPoolCooKernel<T, Context>(
dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook);
MaxPoolCooKernel<T, Context>(dev_ctx,
x,
kernel_sizes,
paddings,
dilations,
strides,
&coo,
rulebook,
counter);
return coo;
}
......
......@@ -76,8 +76,8 @@ void TestConv3dBase(const std::vector<int>& indices,
kernel.size() * sizeof(T));
if (!std::is_same<T, phi::dtype::float16>::value) {
auto tensor_out = paddle::experimental::sparse::conv3d(
x, weight, paddings, dilations, strides, 1, false);
auto tensor_out = paddle::experimental::sparse::conv3d_coo(
x, weight, paddings, dilations, strides, 1, false, "Conv3d");
auto out =
std::dynamic_pointer_cast<phi::SparseCooTensor>(tensor_out.impl());
......
......@@ -112,8 +112,7 @@ void TestConv3dBase(const std::vector<IntT>& indices,
};
if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty(
dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
DenseTensor rulebook, counter;
SparseCooTensor out = sparse::Conv3dCoo<T>(dev_ctx_cpu,
x_tensor,
kernel_tensor,
......@@ -122,7 +121,9 @@ void TestConv3dBase(const std::vector<IntT>& indices,
strides,
1,
subm,
&rulebook);
"Conv3d",
&rulebook,
&counter);
ASSERT_EQ(correct_out_dims.size(), out.dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) {
......@@ -142,13 +143,16 @@ void TestConv3dBase(const std::vector<IntT>& indices,
sparse::Conv3dCooGrad<T>(dev_ctx_cpu,
x_tensor,
kernel_tensor,
out,
rulebook,
counter,
out,
paddings,
dilations,
strides,
1,
subm);
subm,
"Conv3d");
f_verify(std::get<0>(grads).non_zero_elements().data<T>(), features_grad);
f_verify(std::get<1>(grads).data<T>(), kernel_grad);
}
......@@ -196,8 +200,7 @@ void TestConv3dBase(const std::vector<IntT>& indices,
phi::Copy(
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty(
dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
DenseTensor d_rulebook, d_counter;
SparseCooTensor d_out = sparse::Conv3dCoo<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
......@@ -206,8 +209,9 @@ void TestConv3dBase(const std::vector<IntT>& indices,
strides,
1,
subm,
&d_rulebook);
"Conv3d",
&d_rulebook,
&d_counter);
SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
......@@ -245,13 +249,16 @@ void TestConv3dBase(const std::vector<IntT>& indices,
sparse::Conv3dCooGrad<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
d_out,
d_rulebook,
d_counter,
d_out,
paddings,
dilations,
strides,
1,
subm);
subm,
"Conv3d");
DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements();
DenseTensor d_kernel_grad = std::get<1>(grads);
DenseTensor h_features_grad =
......
......@@ -90,14 +90,15 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
};
if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook;
DenseTensor rulebook, counter;
SparseCooTensor out = sparse::MaxPoolCoo<T>(dev_ctx_cpu,
x_tensor,
kernel_sizes,
paddings,
dilations,
strides,
&rulebook);
&rulebook,
&counter);
ASSERT_EQ(correct_out_dims.size(), out.dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) {
......@@ -114,7 +115,7 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
if (backward) {
SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>(
dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes);
dev_ctx_cpu, x_tensor, rulebook, counter, out, out, kernel_sizes);
f_verify(x_grad.non_zero_elements().data<T>(), features_grad);
}
}
......@@ -150,14 +151,16 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_rulebook;
DenseTensor d_rulebook, d_counter;
SparseCooTensor d_out = sparse::MaxPoolCoo<T>(dev_ctx_gpu,
d_x_tensor,
kernel_sizes,
paddings,
dilations,
strides,
&d_rulebook);
&d_rulebook,
&d_counter);
SparseCooTensor tmp_d_out = sparse::Coalesce<T>(dev_ctx_gpu, d_out);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
......@@ -191,8 +194,13 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) {
SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>(
dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes);
SparseCooTensor x_grad = sparse::MaxPoolCooGrad<T>(dev_ctx_gpu,
d_x_tensor,
d_rulebook,
d_counter,
d_out,
d_out,
kernel_sizes);
DenseTensor h_features_grad =
phi::EmptyLike<T>(dev_ctx_cpu, x_grad.non_zero_elements());
phi::Copy(dev_ctx_gpu,
......
......@@ -67,7 +67,7 @@ class TestSparseConv(unittest.TestCase):
indices, values, dense_shape, stop_gradient=True)
weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')
y = paddle.incubate.sparse.nn.functional.subm_conv3d(
sparse_x, weight)
sparse_x, weight, key='subm_conv')
assert np.array_equal(sparse_x.indices().numpy(),
y.indices().numpy())
......@@ -91,7 +91,7 @@ class TestSparseConv(unittest.TestCase):
with self.assertRaises(ValueError):
#Currently, only support data_format='NDHWC'
conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NCDHW')
1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv')
def test_SubmConv3D(self):
with _test_eager_guard():
......@@ -105,7 +105,7 @@ class TestSparseConv(unittest.TestCase):
indices, values, dense_shape, False)
subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NDHWC')
1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv')
# test extra_repr
print(subm_conv3d.extra_repr())
......@@ -117,7 +117,7 @@ class TestSparseConv(unittest.TestCase):
with self.assertRaises(ValueError):
#Currently, only support data_format='NDHWC'
conv3d = paddle.incubate.sparse.nn.SubmConv3D(
1, 1, (1, 3, 3), data_format='NCDHW')
1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv')
def test_Conv3D_bias(self):
with _test_eager_guard():
......
......@@ -29,6 +29,7 @@ def _conv3d(x,
dilation=1,
groups=1,
subm=False,
key=None,
data_format="NDHWC",
name=None):
assert in_dynamic_mode(), "Currently, only support dynamic mode"
......@@ -62,8 +63,9 @@ def _conv3d(x,
dilation = convert_to_list(dilation, dims, 'dilation')
op_type = "conv3d"
pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation,
stride, groups, subm)
pre_bias = _C_ops.final_state_sparse_conv3d_coo(
x, weight, padding, dilation, stride, groups, subm,
key if key is not None else "")
if bias is not None:
values = pre_bias.values()
add_bias = elementwise_add(values, bias, axis=1)
......@@ -186,7 +188,7 @@ def conv3d(x,
# (1, 1, 1, 2, 1)
"""
return _conv3d(x, weight, bias, stride, padding, dilation, groups, False,
data_format, name)
None, data_format, name)
def subm_conv3d(x,
......@@ -197,6 +199,7 @@ def subm_conv3d(x,
dilation=1,
groups=1,
data_format="NDHWC",
key=None,
name=None):
r"""
......@@ -274,6 +277,10 @@ def subm_conv3d(x,
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of:
`[batch_size, input_depth, input_height, input_width, input_channels]`.
key(str, optional): the key is used to save or use the same rulebook,
the definition and role of rulebook refers to
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The
default value is None.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
......@@ -301,4 +308,4 @@ def subm_conv3d(x,
#(1, 1, 3, 4, 1)
"""
return _conv3d(x, weight, bias, stride, padding, dilation, groups, True,
data_format, name)
key, data_format, name)
......@@ -33,6 +33,7 @@ class _Conv3D(Layer):
dilation=1,
groups=1,
subm=False,
key=None,
padding_mode='zeros',
weight_attr=None,
bias_attr=None,
......@@ -46,6 +47,7 @@ class _Conv3D(Layer):
self._out_channels = out_channels
self._data_format = data_format
self._subm = subm
self._key = key
assert padding_mode == 'zeros', "Currently, only support padding_mode='zeros'"
assert groups == 1, "Currently, only support groups=1"
......@@ -95,6 +97,7 @@ class _Conv3D(Layer):
dilation=self._dilation,
groups=self._groups,
subm=self._subm,
key=self._key,
data_format=self._data_format)
return out
......@@ -240,6 +243,7 @@ class Conv3D(_Conv3D):
dilation=dilation,
groups=groups,
subm=False,
key=None,
padding_mode=padding_mode,
weight_attr=weight_attr,
bias_attr=bias_attr,
......@@ -293,6 +297,10 @@ class SubmConv3D(_Conv3D):
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. The default value is 1.
padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``.
key(str, optional): the key is used to save or use the same rulebook,
the definition and role of rulebook refers to
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The
default value is None.
weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter
......@@ -361,6 +369,7 @@ class SubmConv3D(_Conv3D):
dilation=1,
groups=1,
padding_mode='zeros',
key=None,
weight_attr=None,
bias_attr=None,
data_format="NDHWC"):
......@@ -372,6 +381,7 @@ class SubmConv3D(_Conv3D):
dilation=dilation,
groups=groups,
subm=True,
key=key,
padding_mode=padding_mode,
weight_attr=weight_attr,
bias_attr=bias_attr,
......