Unverified commit 9918bf9c, authored by Wang Xin, committed by GitHub

[PHI decoupling] remove "gpu_primitives.h" in fluid (#48063)

* remove "gpu_primitives.h" in fluid namespace

* fix PR-CI-GpuPS fail

* fix PR-CI-GpuPS fail
Parent commit: a33d563c
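Essentially every hunk below applies the same mechanical migration: the fluid header "paddle/fluid/platform/device/gpu/gpu_primitives.h" is replaced by "paddle/phi/backends/gpu/gpu_primitives.h", and the helpers it provides (CudaAtomicAdd, CudaAtomicMax, fastAtomicAdd, VectorizedAtomicAddPerBlock, PADDLE_CUDA_NUM_THREADS) are re-qualified from paddle::platform:: to phi::. As a minimal sketch of the post-migration pattern (not code from this PR; the kernel name and arguments are invented for illustration):

    // Hypothetical kernel showing the phi-qualified primitives this PR switches to.
    #include "paddle/phi/backends/gpu/gpu_primitives.h"  // was paddle/fluid/platform/device/gpu/gpu_primitives.h

    template <typename T>
    __global__ void ExampleScatterAdd(T* dst, const T* src, const int* index, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        // was: paddle::platform::CudaAtomicAdd(&dst[index[i]], src[i]);
        phi::CudaAtomicAdd(&dst[index[i]], src[i]);
      }
    }

    // Launch configuration constant, was platform::PADDLE_CUDA_NUM_THREADS:
    // const int threads = phi::PADDLE_CUDA_NUM_THREADS;

Apart from the namespace change, some re-wrapped lines, and a few added // NOLINT annotations, the call sites are untouched, so kernel behavior is unchanged.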
@@ -13,12 +13,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HETERPS
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
@@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest,
 int x = low;
 int y = i - (x ? len[x - 1] : 0);
 float* feature_value_ptr =
-(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
+(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT
 int mf_dim = gpu_dim[x] - 3;
 gpu_accessor.Select(
 dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
@@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N,
 return;
 }
-float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
+float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT
 uint64_t(max_val_size));
 switch (off) {
 case 0:
@@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest,
 }
 int x = low;
 int y = i - (x ? len[low - 1] : 0);
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
-cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
+cur[gpu_accessor.common_push_value.SlotIndex()] =
+(float)slot_vector[x]; // NOLINT
 int mf_dim = mf_dim_vector[x];
 cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
@@ -170,31 +171,29 @@ __global__ void PushMergeCopyAtomic(const size_t N,
 int y = i - slot_lens[x];
 const float* ptr = src[x] + y * hidden;
-float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
+float* cur =
+(float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT
 int mf_dim = slot_dims[x] - 3;
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
-paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
-*(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off));
 break;
 case 1:
-paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
-*(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off));
 break;
 case 2:
-paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
+phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
 *(ptr + off) * -1. * bs);
 break;
 default:
 int embedx_idx = off - 3;
 if (mf_dim < embedx_idx) {
 return;
 }
-paddle::platform::CudaAtomicAdd(
-&cur[accessor.EmbedxGIndex() + embedx_idx],
-*(ptr + off) * -1. * bs);
+phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx],
+*(ptr + off) * -1. * bs);
 break;
 }
 }
@@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N,
 int i = idx / hidden;
 int off = idx % hidden;
 // filter 0 keys
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
 if (total_keys[i] == 0) {
 switch (off) {
@@ -262,7 +261,7 @@ __global__ void PushMergeCopy(const size_t N,
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
 SUM_GRAD_VALUE
 cur[accessor.ShowIndex()] = val;
@@ -331,8 +330,8 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
 const uint64_t total_length,
 const int batch_size,
 size_t grad_value_size,
-std::vector<int>& slot_vector,
-std::vector<int>& slot_mf_dim_vector) {
+std::vector<int>& slot_vector, // NOLINT
+std::vector<int>& slot_mf_dim_vector) { // NOLINT
 auto stream = dynamic_cast<phi::GPUContext*>(
 paddle::platform::DeviceContextPool::Instance().Get(place))
 ->stream();
...
@@ -22,12 +22,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
...
@@ -20,8 +20,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
 #include "paddle/fluid/operators/fused_token_prune_op.cu.h"
@@ -149,7 +149,7 @@ __global__ void ReduceSum2<half>(
 }
 if (tid == 0) {
-platform::fastAtomicAdd<platform::float16>(
+phi::fastAtomicAdd<platform::float16>(
 reinterpret_cast<platform::float16*>(dst),
 static_cast<size_t>(batch * max_seq_len + col),
 static_cast<size_t>(bsz * max_seq_len),
...
@@ -23,7 +23,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -23,8 +23,8 @@ We retain the following license from the original files:
 #include "paddle/fluid/operators/assign_pos_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 DECLARE_bool(avoid_op_randomness);
@@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count,
 CUDA_KERNEL_LOOP(i, limit) {
 int number_idx = numbers[i];
 if (number_idx > -1) {
-int p = platform::CudaAtomicAdd(cum_count + number_idx, -1);
+int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
 out[p - 1] = i;
 }
 }
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 namespace paddle {
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/bilateral_slice_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,11 +16,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/center_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
 __global__ void ComputeDifferent(T *centers_diff,
@@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers,
 const T *diff = centers_diff + idy * D;
 T *cent = centers + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
+phi::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table,
 auto id = ids[row];
 if (id >= start_idx && id < end_idx) {
 auto real_idx = id - start_idx;
-paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
-output[i]);
+phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
 }
 }
 }
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_shift_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -16,12 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/cvm_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -29,7 +29,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
 using DataLayout = phi::DataLayout;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
...
@@ -32,7 +32,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -41,7 +41,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 static inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
@@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 // compute gradient of input
 if (bottom_data_diff) {
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y0 * width + x0,
-q00 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y1 * width + x0,
-q01 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y0 * width + x1,
-q10 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y1 * width + x1,
-q11 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0,
+q00 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0,
+q01 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1,
+q10 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1,
+q11 * diff_val);
 }
 // compute gradient of trans
@@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 u00 * (1 - dist_x)) *
 trans_std * diff_val;
 diff_y *= roi_height;
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
 }
 }
 }
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/box_clip_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -11,7 +11,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -26,8 +26,8 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 namespace paddle {
@@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads,
 const int* batch_ids,
 int* length_lod) {
 CUDA_KERNEL_LOOP(i, nthreads) {
-platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
+phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
 }
 }
...
@@ -14,13 +14,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 using Tensor = phi::DenseTensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 #define CUDA_BLOCK_SIZE 16
 template <typename T>
...
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 using paddle::platform::float16;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 namespace paddle {
 namespace operators {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math.h"
...
@@ -43,7 +43,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #endif
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 namespace paddle {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -35,7 +35,7 @@ class ReduceAdd {
 typename tensor_t,
 std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
 __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
-platform::CudaAtomicAdd(self_data, *src_data);
+phi::CudaAtomicAdd(self_data, *src_data);
 }
 template <typename tensor_t,
 std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>
...
@@ -41,8 +41,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/graph_khop_sampler_imp.h"
 #include "paddle/fluid/operators/graph_khop_sampler_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 constexpr int WARP_SIZE = 32;
@@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed,
 const int num = curand(&rng) % (idx + 1);
 #endif
 if (num < k) {
-paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num,
-idx);
+phi::CudaAtomicMax(output_idxs + out_row_start + num, idx);
 }
 }
 #ifdef PADDLE_WITH_CUDA
...
@@ -22,7 +22,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/group_norm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
 typedef cub::WarpReduce<T> WarpReduce;
 typename WarpReduce::TempStorage temp_storage;
 value = WarpReduce(temp_storage).Sum(value);
-if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
+if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value);
 }
 template <typename T>
@@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
 if (flags & kHasScale) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
+phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
 #else
 CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data);
 #endif
 }
 if (flags & kHasBias) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
+phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
 #else
 CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data);
 #endif
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in,
 in_img_idx * num_channels + channel_id];
 }
 const T out_pos = out[out_id_h * output_w + out_id_w];
-platform::CudaAtomicAdd(in_pos, out_pos);
+phi::CudaAtomicAdd(in_pos, out_pos);
 }
 }
@@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in,
 const T* out_pos = &out[out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]);
 }
 }
 }
@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w],
 h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
 h1lambda * w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels],
 h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
 h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
 h1lambda * w1lambda * out_pos[0]);
 }
@@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
+phi::CudaAtomicAdd(&in_pos1[0],
 d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id],
+phi::CudaAtomicAdd(&in_pos1[w_id],
 d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
 d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
 d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
+phi::CudaAtomicAdd(&in_pos2[0],
 d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id],
+phi::CudaAtomicAdd(&in_pos2[w_id],
 d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
 d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
 d1lambda * h1lambda * w1lambda * out_pos[0]);
 } else {
 int in_pos1_idx = out_id_h * input_w +
 in_img_idt * in_img_h * in_img_w * num_channels +
@@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
+phi::CudaAtomicAdd(&in_pos1[0],
 d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos1[w_id * num_channels],
 d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
 d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
 d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
+phi::CudaAtomicAdd(&in_pos2[0],
 d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos2[w_id * num_channels],
 d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
 d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
 d1lambda * h1lambda * w1lambda * out_pos[0]);
 }
@@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in,
 in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
 access_x * num_channels + channel_id];
 }
-platform::CudaAtomicAdd(&in_pos[0],
+phi::CudaAtomicAdd(&in_pos[0],
 (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
 }
 }
 }
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/limit_by_capacity_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl(
 wid = i / n_expert;
 eid = i % n_expert;
 auto proposal = expc[wid * n_expert + eid];
-auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
+auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1));
 if (cap_left >= proposal) {
 out[wid * n_expert + eid] = proposal;
 } else if (cap_left >= 0) {
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 #ifdef PADDLE_WITH_CUDA
-paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
 #else
 for (int i = idx; i < D; i += blockDim.x) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 #endif
 idy += blockDim.y * gridDim.x;
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 for (size_t i = 0; i < cols; ++i) {
 T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
 z_data * y[i] * reciprocal_y_norm_square);
-platform::CudaAtomicAdd(dy + i, dy_data);
+phi::CudaAtomicAdd(dy + i, dy_data);
 }
 }
 }
...
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
 namespace math {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data,
 if (height_offset >= 0 && height_offset < im_height &&
 width_offset >= 0 && width_offset < im_width) {
-paddle::platform::CudaAtomicAdd(im_data + im_offset,
-col_data[col_offset]);
+phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
 }
 }
 }
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/sequence_pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 namespace math {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockSize>
 __global__ void SequenceScaleKernel(T* seq,
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/mean_iou_op.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void CountCUDAKernel(const int num_classes,
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/number_count_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers,
 #endif
 }
 if (threadIdx.x % WARP_SIZE == 0) {
-platform::CudaAtomicAdd(number_count + i, x);
+phi::CudaAtomicAdd(number_count + i, x);
 }
 }
 }
...
@@ -14,11 +14,11 @@
 #include "paddle/fluid/operators/one_hot_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename InT, typename OutT>
 __global__ void FillOutputKernel(const InT* p_in_data,
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
 // Since index in rows of SelectedRows can be duplicate, we have to use
 // Atomic Operation to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 tensor_out_ptr + index,
 -static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
 }
...
@@ -16,13 +16,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void Pad2DConstNCHW(const int nthreads,
@@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, 2 * in_height - in_h - 2);
 in_w = min(in_w, 2 * in_width - in_w - 2);
-platform::CudaAtomicAdd(
-&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+d_out_data[out_index]);
 }
 }
@@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, in_height * 2 - in_h - 2);
 in_w = min(in_w, in_width * 2 - in_w - 2);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
 d_out_data[out_index]);
 }
@@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size,
 nc /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
-&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+d_out_data[out_index]);
 }
 }
@@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size,
 n /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
 d_out_data[out_index]);
 }
...
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #endif
 namespace paddle {
@@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff,
 const T coeff) {
 bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
 if (!overflow) {
-paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
+phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
 }
 }
 #else
@@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff,
 #if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 DEVICE void AccumulateRois(T* offset, T data) {
-paddle::platform::CudaAtomicAdd(offset, data);
+phi::CudaAtomicAdd(offset, data);
 }
 #else
 template <typename T>
...
@@ -21,7 +21,7 @@
 // Licensed under the Apache License, Version 2.0 (the "License").
 #include "paddle/fluid/operators/prune_gate_by_capacity_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
@@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data,
 const int64_t batch_size) {
 CUDA_KERNEL_LOOP(i, batch_size) {
 auto orig_cap =
-platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
+phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
 if (orig_cap <= 0) {
 new_gate_idx_data[i] = -1;
 } else {
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/pull_box_extended_sparse_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "xpu/kernel/math.h" // NOLINT
 #else
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #endif
 #include "paddle/fluid/operators/pull_box_sparse_op.h"
@@ -46,9 +46,13 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(pull_box_sparse,
KP,
plat::XPUPlace,
ops::PullBoxSparseKernel<float>); ops::PullBoxSparseKernel<float>);
REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(push_box_sparse,
KP,
plat::XPUPlace,
ops::PushBoxSparseKernel<float>); ops::PushBoxSparseKernel<float>);
#else #else
REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>); REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
#include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/operators/pull_gpups_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h" #include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/operators/random_routing_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
......
...@@ -16,11 +16,11 @@ ...@@ -16,11 +16,11 @@
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -16,11 +16,11 @@ limitations under the License. */ ...@@ -16,11 +16,11 @@ limitations under the License. */
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, ...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length; for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) { tid_x += blockDim.x) {
platform::CudaAtomicAdd( phi::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x], &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) * dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length + x_item_length +
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -28,8 +28,8 @@ limitations under the License. */ ...@@ -28,8 +28,8 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#define FINAL_MASK 0xffffffff #define FINAL_MASK 0xffffffff
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, ...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input,
if (GetLaneId() == 0) { if (GetLaneId() == 0) {
#pragma unroll #pragma unroll
for (uint32_t i = 0; i < RadixSize; ++i) { for (uint32_t i = 0; i < RadixSize; ++i) {
platform::CudaAtomicAdd(&shared_mem[i], counts[i]); phi::CudaAtomicAdd(&shared_mem[i], counts[i]);
} }
} }
......
...@@ -16,9 +16,9 @@ limitations under the License. */ ...@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h"
......
...@@ -24,17 +24,15 @@ ...@@ -24,17 +24,15 @@
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
using paddle::platform::float16; using paddle::platform::float16;
using paddle::platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T> template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
CUDA_KERNEL_LOOP(i, num) { CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); }
paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
}
} }
template <typename T> template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
#define CUDA_ATOMIC_WRAPPER(op, T) \
__device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
#define USE_CUDA_ATOMIC(op, T) \
CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
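// For reference, USE_CUDA_ATOMIC(Add, float) expands through
// CUDA_ATOMIC_WRAPPER into the following device wrapper (the expansion is
// shown here only as an illustrative sketch of the macro mechanics):
//
//   __device__ __forceinline__ float CudaAtomicAdd(float *address,
//                                                  const float val) {
//     return atomicAdd(address, val);
//   }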
// Default thread count per block (or block size).
// TODO(typhoonzero): need to benchmark against setting this value
// to 1024.
constexpr int PADDLE_CUDA_NUM_THREADS = 512;
// For atomicAdd.
USE_CUDA_ATOMIC(Add, float);
USE_CUDA_ATOMIC(Add, int);
USE_CUDA_ATOMIC(Add, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT
CUDA_ATOMIC_WRAPPER(Add, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
return CudaAtomicAdd(
reinterpret_cast<unsigned long long int *>(address), // NOLINT
static_cast<unsigned long long int>(val)); // NOLINT
}
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
unsigned long long int *address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
old = atomicCAS(address_as_ull,
assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
#ifdef PADDLE_CUDA_FP16
// NOTE(dzhwinter): CUDA does not have atomicCAS for half.
// Instead, treat the half address as part of an unsigned 32-bit value and
// do the atomicCAS on that. Depending on whether the half is stored in the
// high or the low 16 bits, a different sum and CAS are performed.
// Since most warp threads will fail the atomicCAS, this implementation
// should be avoided under high concurrency; it is slower than converting
// the value to 32 bits and doing a full atomicCAS.
// The value is converted to float for the add arithmetic, and the result
// is then stored back into a uint32.
inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
float16 low_half;
// the float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
float16 high_half;
// the float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
// The packed float16 value may exist in either the lower or the higher
// 16 bits of the 32-bit address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
template <typename T, bool IsAvailable, typename NVType, typename NVVec2Type>
struct VecAtomicAddHelperBase {
static constexpr auto kIsAvailable = IsAvailable;
using NVT = NVType;
using NVVec2T = NVVec2Type;
};
template <typename T>
struct VecAtomicAddHelper : VecAtomicAddHelperBase<T, false, void, void> {};
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
template <>
struct VecAtomicAddHelper<platform::float16>
: VecAtomicAddHelperBase<platform::float16, true, __half, __half2> {};
#endif
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
template <>
struct VecAtomicAddHelper<platform::bfloat16>
: VecAtomicAddHelperBase<platform::bfloat16,
true,
__nv_bfloat16,
__nv_bfloat162> {};
#endif
// The performance of "atomicAdd(half*)" is poor, while "atomicAdd(half2*)"
// is good, so for the fp16 type we use "atomicAdd(half2*)" to speed it up.
template <typename T,
typename std::enable_if<VecAtomicAddHelper<T>::kIsAvailable>::type * =
nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *tensor,
size_t index,
const size_t numel,
T value) {
// Check whether the address is aligned to a 2-element vector (e.g. __half2).
using NVT = typename VecAtomicAddHelper<T>::NVT;
using NVVec2T = typename VecAtomicAddHelper<T>::NVVec2T;
NVT *target_addr = reinterpret_cast<NVT *>(tensor + index);
bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(NVVec2T) == 0);
if (aligned_half2 && index < (numel - 1)) {
NVVec2T value2;
value2.x = *reinterpret_cast<NVT *>(&value);
value2.y = 0.0;
atomicAdd(reinterpret_cast<NVVec2T *>(target_addr), value2);
} else if (!aligned_half2 && index > 0) {
NVVec2T value2;
value2.x = 0.0;
value2.y = *reinterpret_cast<NVT *>(&value);
atomicAdd(reinterpret_cast<NVVec2T *>(target_addr - 1), value2);
} else {
atomicAdd(reinterpret_cast<NVT *>(tensor) + index,
*reinterpret_cast<NVT *>(&value));
}
}
template <typename T,
typename std::enable_if<!VecAtomicAddHelper<T>::kIsAvailable>::type
* = nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *arr,
size_t index,
const size_t numel,
T value) {
CudaAtomicAdd(arr + index, value);
}
#endif
// NOTE(zhangbo): CUDA does not have atomicCAS for __nv_bfloat16.
inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
bfloat16 low_half;
// the bfloat16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
bfloat16 high_half;
// the bfloat16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
return *reinterpret_cast<bfloat16 *>(&x);
}
static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
return *reinterpret_cast<__nv_bfloat16 *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
PDBF16ToCUDABF16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
// The packed bfloat16 value may exist in either the lower or the higher
// 16 bits of the 32-bit address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The bfloat16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The bfloat16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex<float>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex<double>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
// For atomicMax
USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT
if (*address >= val) {
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
do {
assumed = old;
if (assumed >= val) {
break;
}
old = atomicCAS(address, assumed, val);
} while (assumed != old);
}
#endif
CUDA_ATOMIC_WRAPPER(Max, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
long long int res = *address; // NOLINT
while (val > res) {
long long int old = res; // NOLINT
res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT
(unsigned long long int)old, // NOLINT
(unsigned long long int)val); // NOLINT
if (res == old) {
break;
}
}
return res;
}
CUDA_ATOMIC_WRAPPER(Max, float) {
if (*address >= val) {
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
if (__int_as_float(assumed) >= val) {
break;
}
old = atomicCAS(address_as_i, assumed, __float_as_int(val));
} while (assumed != old);
return __int_as_float(old);
}
CUDA_ATOMIC_WRAPPER(Max, double) {
if (*address >= val) {
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
if (__longlong_as_double(assumed) >= val) {
break;
}
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
} while (assumed != old);
return __longlong_as_double(old);
}
#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(max(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(max(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
CUDA_ATOMIC_WRAPPER(Max, float16) {
if (*address >= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
// For atomicMin
USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT
if (*address <= val) {
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
do {
assumed = old;
if (assumed <= val) {
break;
}
old = atomicCAS(address, assumed, val);
} while (assumed != old);
// Return the previous value on every path; the original code fell off the
// end of this non-void wrapper, which is undefined behavior.
return old;
}
#endif
CUDA_ATOMIC_WRAPPER(Min, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
long long int res = *address; // NOLINT
while (val < res) {
long long int old = res; // NOLINT
res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT
(unsigned long long int)old, // NOLINT
(unsigned long long int)val); // NOLINT
if (res == old) {
break;
}
}
return res;
}
CUDA_ATOMIC_WRAPPER(Min, float) {
if (*address <= val) {
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
if (__int_as_float(assumed) <= val) {
break;
}
old = atomicCAS(address_as_i, assumed, __float_as_int(val));
} while (assumed != old);
return __int_as_float(old);
}
CUDA_ATOMIC_WRAPPER(Min, double) {
if (*address <= val) {
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
if (__longlong_as_double(assumed) <= val) {
break;
}
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
} while (assumed != old);
return __longlong_as_double(old);
}
#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(min(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(min(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
CUDA_ATOMIC_WRAPPER(Min, float16) {
if (*address <= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
#ifdef PADDLE_WITH_CUDA
/*
* One thread block handles the elementwise atomicAdd of a vector of length len.
* @in: [x1, x2, x3, ...]
* @out:[y1+x1, y2+x2, y3+x3, ...]
* */
template <typename T,
typename std::enable_if<!VecAtomicAddHelper<T>::kIsAvailable>::type
* = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
}
// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
template <typename T,
typename std::enable_if<VecAtomicAddHelper<T>::kIsAvailable>::type * =
nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
int i = 0;
int loops = len / 2 * 2;
using NVT = typename VecAtomicAddHelper<T>::NVT;
using NVVec2T = typename VecAtomicAddHelper<T>::NVVec2T;
bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(out) % sizeof(NVT) == 0);
if (aligned_half2) {
for (i = tid * 2; i < loops; i += threads_per_block * 2) {
NVVec2T value2;
T value_1 = in[i];
T value_2 = in[i + 1];
value2.x = *reinterpret_cast<NVT *>(&value_1);
value2.y = *reinterpret_cast<NVT *>(&value_2);
atomicAdd(reinterpret_cast<NVVec2T *>(&out[i]), value2);
}
for (; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
} else {
for (int i = tid; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
}
}
#endif
} // namespace platform
} // namespace paddle
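For reference, a minimal sketch of how a kernel consumes the fp16 helpers defined in the header above; the kernel name and launch shape are illustrative only, it assumes PADDLE_CUDA_FP16 is defined, and after this commit the same helpers are reached through paddle/phi/backends/gpu/gpu_primitives.h in the phi namespace:

__global__ void AccumulateFp16Sketch(const paddle::platform::float16* src,
                                     paddle::platform::float16* dst,
                                     size_t numel) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < numel) {
    // Uses atomicAdd(__half2*) when the destination address allows it,
    // otherwise falls back to the CAS-based CudaAtomicAdd above.
    paddle::platform::fastAtomicAdd(dst, i, numel, src[i]);
  }
}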
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......