Unverified commit e923642e authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove distribute header (#52202)

* remove distribute

* fix py3 bugs

* fix gpu-ps bugs

* fix compile bugs

* fix unittest bugs
Parent f22b9666
......@@ -65,3 +65,15 @@ if(WITH_CUSTOM_DEVICE)
comm_static_check
dense_tensor)
endif()
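# Moved here as part of the PHI decoupling; the same block is removed from its
# old location later in this diff.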
set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
cc_library(
processgroup_comm_utils
SRCS processgroup_comm_utils.cc
DEPS ${COMM_UTILS_DEPS})
......@@ -20,7 +20,7 @@ cc_library(
graph_helper
SRCS graph_helper.cc
DEPS graph program_utils scale_loss_grad_op_handle
grad_merge_all_reduce_op_handle)
grad_merge_all_reduce_op_handle collective_helper)
cc_library(
pass
SRCS pass.cc
......
......@@ -162,6 +162,10 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
if(WITH_NCCL OR WITH_RCCL)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
endif()
if (WITH_GPU OR WITH_ROCM)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif()
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used to compile margin_cross_entropy_op.cu.
// It will be deleted after margin_cross_entropy_op is moved to phi.
......@@ -22,19 +22,25 @@ namespace cub = hipcub;
#include <vector>
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
// trace op include
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -467,6 +473,116 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
#endif
}
template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad,
const T* loss_grad,
const T* logits,
const IndexT* label,
const float margin1,
const float margin2,
const float scale,
const int rank,
const int64_t N,
const int64_t D,
const int* class_interval_ptr) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == label[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
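// The extra factor below is only needed when the angular margins are active
// (margin1 != 1 or margin2 != 0). With theta = acos(x), it is the chain-rule
// term d(cos(margin1 * theta + margin2)) / dx
//   = margin1 * sin(margin1 * theta + margin2) / sqrt(1 - x * x).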
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType dout = static_cast<MPType>(logits_grad[i]);
MPType one = static_cast<MPType>(1.0f);
MPType x = static_cast<MPType>(logits[i]);
MPType m1 = static_cast<MPType>(margin1);
MPType m2 = static_cast<MPType>(margin2);
MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
logits_grad[i] = static_cast<T>(dout * d);
}
} else {
logits_grad[i] *= loss_grad[row];
}
if (fabs(scale - 1.0) > 1e-8) {
logits_grad[i] *= static_cast<T>(scale);
}
}
}
template <typename T, typename Context>
void MarginCrossEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& logits,
const DenseTensor& label,
const DenseTensor& softmax,
const DenseTensor& loss_grad,
bool return_softmax,
int ring_id,
int rank,
int nranks,
float margin1,
float margin2,
float margin3,
float scale,
DenseTensor* logits_grad) {
const auto softmax_dims = softmax.dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
if (return_softmax) {
phi::Copy<Context>(
dev_ctx, softmax, dev_ctx.GetPlace(), false, logits_grad);
} else {
logits_grad->ShareDataWith(softmax);
}
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = label.dtype();
DenseTensor class_interval;
GetClassInterval<T, Context>(dev_ctx.stream(),
dev_ctx.GetPlace(),
dev_ctx,
ring_id,
rank,
nranks,
D,
&class_interval);
if (label_type == phi::DataType::INT32) {
typedef int32_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
} else if (label_type == phi::DataType::INT64) {
typedef int64_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
}
}
} // namespace phi
PD_REGISTER_KERNEL(margin_cross_entropy,
......@@ -476,3 +592,11 @@ PD_REGISTER_KERNEL(margin_cross_entropy,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(margin_cross_entropy_grad,
GPU,
ALL_LAYOUT,
phi::MarginCrossEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
......@@ -12,33 +12,36 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
#include "paddle/fluid/operators/sync_batch_norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
// sparse header
#include "paddle/phi/kernels/sparse/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void SyncBatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &mean,
const DenseTensor &variance,
const DenseTensor &scale,
const DenseTensor &bias,
void SyncBatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
const std::string& data_layout_str,
bool use_global_stats,
bool trainable_statistics,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
PADDLE_ENFORCE_EQ(use_global_stats,
false,
phi::errors::InvalidArgument(
......@@ -50,7 +53,7 @@ void SyncBatchNormKernel(const Context &ctx,
const bool trainable_stats = trainable_statistics;
const DataLayout layout = phi::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_statistics);
const auto &x_dims = x.dims();
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
phi::errors::InvalidArgument(
......@@ -63,14 +66,14 @@ void SyncBatchNormKernel(const Context &ctx,
funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
int x_numel = x.numel();
const T *x_d = x.template data<T>();
const auto *s_d = scale.template data<BatchNormParamType<T>>();
const auto *b_d = bias.template data<BatchNormParamType<T>>();
const T* x_d = x.template data<T>();
const auto* s_d = scale.template data<BatchNormParamType<T>>();
const auto* b_d = bias.template data<BatchNormParamType<T>>();
T *y_d = ctx.template Alloc<T>(y);
T* y_d = ctx.template Alloc<T>(y);
const BatchNormParamType<T> *mean_data = nullptr;
const BatchNormParamType<T> *var_data = nullptr;
const BatchNormParamType<T>* mean_data = nullptr;
const BatchNormParamType<T>* var_data = nullptr;
auto stream = ctx.stream();
const int block = 512;
......@@ -90,7 +93,7 @@ void SyncBatchNormKernel(const Context &ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
auto* stats = reinterpret_cast<BatchNormParamType<T>*>(alloc_ptr->ptr());
const int threads = 256;
int grid = std::min(C, (max_threads + threads - 1) / threads);
if (layout == phi::DataLayout::kNCHW) {
......@@ -122,12 +125,12 @@ void SyncBatchNormKernel(const Context &ctx,
}
#endif
auto *est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
auto *est_var_data =
auto* est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
auto* est_var_data =
ctx.template Alloc<BatchNormParamType<T>>(variance_out);
auto *sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
auto *sv_inv_var_data =
auto* sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
auto* sv_inv_var_data =
ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
// Note, Input('Mean')/Input('Variance') share variable with
......@@ -176,6 +179,127 @@ void SyncBatchNormKernel(const Context &ctx,
}
}
template <typename T, typename Context>
void SyncBatchNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const DenseTensor& y_grad,
float momentum,
float epsilon_f,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
SyncBatchNormGradFunctor<T, Context>(ctx,
&x,
nullptr,
scale,
bias,
saved_mean,
saved_variance,
y_grad,
epsilon_f,
data_layout_str,
x_grad,
scale_grad,
bias_grad);
}
} // namespace phi
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon,
const std::string& data_layout,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
EmptyLikeCooKernel<T, Context>(dev_ctx, x, y);
phi::SyncBatchNormKernel<T, Context>(dev_ctx,
x.values(),
mean,
variance,
scale,
bias,
is_test,
momentum,
epsilon,
data_layout,
use_global_stats,
trainable_statistics,
y->mutable_values(),
mean_out,
variance_out,
saved_mean,
saved_variance,
reserve_space);
y->SetIndicesDict(x.GetIndicesDict());
}
template <typename T, typename Context>
void SyncBatchNormCooGradKernel(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const SparseCooTensor& y_grad,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
EmptyLikeCooKernel<T, Context>(dev_ctx, x, x_grad);
*scale_grad = phi::EmptyLike<T, Context>(dev_ctx, scale);
*bias_grad = phi::EmptyLike<T, Context>(dev_ctx, bias);
phi::SyncBatchNormGradKernel<T, Context>(dev_ctx,
x.values(),
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
y_grad.values(),
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
x_grad->mutable_values(),
scale_grad,
bias_grad);
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
......@@ -216,3 +340,54 @@ PD_REGISTER_KERNEL(sync_batch_norm,
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
double,
phi::dtype::float16) {}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
......@@ -227,10 +227,18 @@ if(WITH_GPU)
SRCS device_event_test.cc
DEPS device_event_gpu)
endif()
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends)
if(WITH_CUSTOM_DEVICE)
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context
allocator phi_backends)
else()
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends)
endif()
nv_test(
device_context_test
SRCS device_context_test.cu
......
......@@ -74,15 +74,3 @@ if(WITH_CUSTOM_DEVICE)
SRCS custom/capi_test.cc
DEPS phi_capi)
endif()
set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
cc_library(
processgroup_comm_utils
SRCS processgroup_comm_utils.cc
DEPS ${COMM_UTILS_DEPS})
......@@ -16,7 +16,7 @@ if(WITH_GLOO)
cc_library(
gloo_utils
SRCS gloo_utils.cc
DEPS gloo dense_tensor enforce)
DEPS gloo dense_tensor enforce tcp_store)
cc_library(
gloo_comm_context
......
......@@ -88,13 +88,11 @@ if(WITH_FLASHATTN)
endif()
if(WITH_NCCL OR WITH_RCCL)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl
nccl_comm_context)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} nccl_comm_context)
endif()
if(WITH_GLOO)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} gloo_comm_context)
endif()
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils)
if(WITH_CUDNN_FRONTEND)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cudnn-frontend)
endif()
......@@ -180,32 +178,24 @@ endif()
file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")
add_library(phi_cpu ${kernel_cc})
kernel_declare("${kernel_cc}")
if(WITH_MKLDNN)
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS}
get_kerneltype_forvar_utils)
else()
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
endif()
set(ADD_PHI_KERNELS phi_cpu)
if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU)
add_library(phi_gpu ${kernel_cu})
add_library(phi_gpu ${kernel_cu} ${kernel_cc})
if(WITH_CUTLASS)
add_dependencies(phi_gpu cutlass_codegen)
endif()
elseif(WITH_ROCM)
hip_add_library(phi_gpu STATIC ${kernel_cu})
hip_add_library(phi_gpu STATIC ${kernel_cu} ${kernel_cc})
endif()
kernel_declare("${kernel_cu}")
kernel_declare("${kernel_cc}")
target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu)
endif()
if(WITH_XPU)
elseif(WITH_XPU)
if(WITH_XPU_KP)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
......@@ -215,15 +205,27 @@ if(WITH_XPU)
file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
endforeach()
file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
xpu_add_library(phi_xpu STATIC ${kernel_xpu} ${kernel_xpu_kps} DEPENDS
${COMMON_KERNEL_DEPS})
xpu_add_library(
phi_xpu
STATIC
${kernel_xpu}
${kernel_xpu_kps}
${kernel_cc}
DEPENDS
${COMMON_KERNEL_DEPS})
else()
add_library(phi_xpu ${kernel_xpu})
add_library(phi_xpu ${kernel_xpu} ${kernel_cc})
endif()
kernel_declare("${kernel_xpu}")
kernel_declare("${kernel_xpu_kps}")
kernel_declare("${kernel_cc}")
target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu)
else()
add_library(phi_cpu ${kernel_cc})
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
kernel_declare("${kernel_cc}")
set(ADD_PHI_KERNELS phi_cpu)
endif()
set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS})
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Old op includes; the fluid ones should be removed.
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#endif
#include <vector>
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
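// NumBlocks caps the 1-D launch grid at kNumMaxinumNumBlocks; CUDA_KERNEL_LOOP
// in the kernels below is a grid-stride loop, so a capped grid still covers
// all N elements.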
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
template <typename T, typename Context>
void GetClassInterval(const gpuStream_t& stream,
const phi::Place& place,
const Context& dev_ctx,
const int rid,
const int rank,
const int nranks,
const int D,
DenseTensor* class_interval) {
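// class_interval ends up as an inclusive prefix sum over nranks + 1 entries:
// class_interval[r] is the first global class index owned by rank r. With more
// than one rank, the per-rank class counts are first summed across the group
// (via ProcessGroup when one is registered for this ring id, otherwise via a
// raw ncclAllReduce on the global calculation stream).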
std::vector<int> shard_dim_vec(nranks + 1, 0);
shard_dim_vec[rank + 1] = D;
if (nranks <= 1) {
phi::TensorFromVector(shard_dim_vec, dev_ctx, class_interval);
return;
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
DenseTensor num_classes_per_device;
phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device);
int* num_classes_per_device_ptr = num_classes_per_device.data<int>();
auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(rid)) {
// Use ProcessGroup
paddle::distributed::ProcessGroup* pg = map->get(rid);
std::vector<phi::DenseTensor> in_tensor;
std::vector<phi::DenseTensor> out_tensor;
in_tensor.push_back(num_classes_per_device);
out_tensor.push_back(num_classes_per_device);
paddle::distributed::AllreduceOptions opts;
opts.reduce_op = paddle::distributed::ReduceOp::SUM;
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
const auto& comm =
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
phi::ToNCCLDataType(num_classes_per_device.dtype()),
ncclSum,
comm->comm(),
calcu_stream));
}
class_interval->Resize({nranks + 1});
auto class_interval_ptr = dev_ctx.template Alloc<int>(class_interval);
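// Standard two-phase cub call: the first InclusiveSum with a null workspace
// only queries the required temporary storage size, the second performs the
// actual scan.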
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
class_interval_ptr,
nranks + 1,
stream);
return;
#endif
}
template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad,
const T* loss_grad,
const T* logits,
const IndexT* label,
const float margin1,
const float margin2,
const float scale,
const int rank,
const int64_t N,
const int64_t D,
const int* class_interval_ptr) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == label[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType dout = static_cast<MPType>(logits_grad[i]);
MPType one = static_cast<MPType>(1.0f);
MPType x = static_cast<MPType>(logits[i]);
MPType m1 = static_cast<MPType>(margin1);
MPType m2 = static_cast<MPType>(margin2);
MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
logits_grad[i] = static_cast<T>(dout * d);
}
} else {
logits_grad[i] *= loss_grad[row];
}
if (fabs(scale - 1.0) > 1e-8) {
logits_grad[i] *= static_cast<T>(scale);
}
}
}
template <typename T, typename Context>
void MarginCrossEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& logits,
const DenseTensor& label,
const DenseTensor& softmax,
const DenseTensor& loss_grad,
bool return_softmax,
int ring_id,
int rank,
int nranks,
float margin1,
float margin2,
float margin3,
float scale,
DenseTensor* logits_grad) {
const auto softmax_dims = softmax.dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
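// The gradient is computed in place on top of the softmax values: take a copy
// when return_softmax is set, otherwise alias the softmax storage via
// ShareDataWith.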
if (return_softmax) {
phi::Copy<Context>(
dev_ctx, softmax, dev_ctx.GetPlace(), false, logits_grad);
} else {
logits_grad->ShareDataWith(softmax);
}
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = label.dtype();
DenseTensor class_interval;
GetClassInterval<T, Context>(dev_ctx.stream(),
dev_ctx.GetPlace(),
dev_ctx,
ring_id,
rank,
nranks,
D,
&class_interval);
if (label_type == phi::DataType::INT32) {
typedef int32_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
} else if (label_type == phi::DataType::INT64) {
typedef int64_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
}
}
} // namespace phi
PD_REGISTER_KERNEL(margin_cross_entropy_grad,
GPU,
ALL_LAYOUT,
phi::MarginCrossEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
namespace phi {
template <typename T, typename Context>
void SyncBatchNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const DenseTensor& y_grad,
float momentum,
float epsilon_f,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
SyncBatchNormGradFunctor<T, Context>(ctx,
&x,
nullptr,
scale,
bias,
saved_mean,
saved_variance,
y_grad,
epsilon_f,
data_layout_str,
x_grad,
scale_grad,
bias_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooGradKernel(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const SparseCooTensor& y_grad,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
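// The sparse COO grad kernel allocates outputs shaped like its inputs and then
// runs the dense sync_batch_norm grad kernel on the stored non-zero values.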
EmptyLikeCooKernel<T, Context>(dev_ctx, x, x_grad);
*scale_grad = phi::EmptyLike<T, Context>(dev_ctx, scale);
*bias_grad = phi::EmptyLike<T, Context>(dev_ctx, bias);
phi::SyncBatchNormGradKernel<T, Context>(dev_ctx,
x.values(),
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
y_grad.values(),
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
x_grad->mutable_values(),
scale_grad,
bias_grad);
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sync_batch_norm_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon,
const std::string& data_layout,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
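// The sparse COO kernel normalizes only the stored non-zero values with the
// dense sync_batch_norm kernel and reuses the sparsity pattern of x for y.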
EmptyLikeCooKernel<T, Context>(dev_ctx, x, y);
phi::SyncBatchNormKernel<T, Context>(dev_ctx,
x.values(),
mean,
variance,
scale,
bias,
is_test,
momentum,
epsilon,
data_layout,
use_global_stats,
trainable_statistics,
y->mutable_values(),
mean_out,
variance_out,
saved_mean,
saved_variance,
reserve_space);
y->SetIndicesDict(x.GetIndicesDict());
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
double,
phi::dtype::float16) {}
#endif