From 9adca1e73e13049d82f667571e3f236372c1ef31 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 11:08:10 +0800 Subject: [PATCH] move "gpu_primitives.h" to phi (#48015) --- paddle/phi/backends/gpu/gpu_primitives.h | 610 ++++++++++++++++++ .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 2 +- .../kernels/funcs/detail/lstm_gpu_kernel.h | 11 +- paddle/phi/kernels/funcs/gather.cu.h | 4 +- paddle/phi/kernels/funcs/pooling.cu | 9 +- paddle/phi/kernels/funcs/scatter.cu.h | 6 +- paddle/phi/kernels/funcs/segment_pooling.cu | 20 +- .../kernels/funcs/selected_rows_functor.cu | 12 +- paddle/phi/kernels/gpu/accuracy_kernel.cu | 4 +- paddle/phi/kernels/gpu/adagrad_kernel.cu | 10 +- .../kernels/gpu/affine_grid_grad_kernel.cu | 51 +- paddle/phi/kernels/gpu/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpu/auc_kernel.cu | 8 +- paddle/phi/kernels/gpu/bincount_kernel.cu | 9 +- paddle/phi/kernels/gpu/box_coder.cu | 2 +- .../gpu/deformable_conv_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/depthwise_conv.h | 6 +- .../phi/kernels/gpu/diagonal_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/diagonal_kernel.cu | 4 +- .../gpu/distribute_fpn_proposals_kernel.cu | 4 +- .../phi/kernels/gpu/edit_distance_kernel.cu | 4 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/graph_reindex_funcs.h | 2 +- .../phi/kernels/gpu/graph_send_recv_funcs.h | 21 +- .../kernels/gpu/graph_send_ue_recv_funcs.h | 53 +- .../kernels/gpu/grid_sample_grad_kernel.cu | 6 +- .../phi/kernels/gpu/group_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/group_norm_utils.h | 4 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 8 +- .../phi/kernels/gpu/index_add_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/index_add_kernel.cu | 6 +- .../kernels/gpu/index_sample_grad_kernel.cu | 6 +- .../kernels/gpu/index_select_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/index_select_impl.h | 4 +- paddle/phi/kernels/gpu/index_select_kernel.cu | 4 +- .../kernels/gpu/interpolate_grad_kernel.cu | 160 ++--- paddle/phi/kernels/gpu/interpolate_kernel.cu | 2 +- paddle/phi/kernels/gpu/linspace_kernel.cu | 2 +- .../phi/kernels/gpu/nanmedian_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/nanmedian_kernel.cu | 12 +- paddle/phi/kernels/gpu/nll_loss.h | 6 +- paddle/phi/kernels/gpu/nms_kernel.cu | 2 +- paddle/phi/kernels/gpu/one_hot_kernel.cu | 4 +- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/pad3d_kernel.cu | 4 +- .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 4 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 15 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel_impl.h | 4 +- .../kernels/gpu/send_ue_recv_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 2 +- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/send_uv_kernel.cu | 2 +- paddle/phi/kernels/gpu/sgd_kernel.cu | 4 +- paddle/phi/kernels/gpu/shard_index_kernel.cu | 4 +- paddle/phi/kernels/gpu/transpose_kernel.cu | 2 +- paddle/phi/kernels/gpu/trunc_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/trunc_kernel.cu | 4 +- .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- .../impl/repeat_interleave_grad_kernel_impl.h | 6 +- .../impl/repeat_interleave_kernel_impl.h | 14 +- 64 files changed, 895 insertions(+), 328 deletions(-) create mode 100644 paddle/phi/backends/gpu/gpu_primitives.h diff --git a/paddle/phi/backends/gpu/gpu_primitives.h 
b/paddle/phi/backends/gpu/gpu_primitives.h new file mode 100644 index 00000000000..be08f29aa81 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -0,0 +1,610 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif +#include + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +template +using complex = phi::dtype::complex; + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; + +namespace phi { + +#define CUDA_ATOMIC_WRAPPER(op, T) \ + __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) + +#define USE_CUDA_ATOMIC(op, T) \ + CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } + +// Default thread count per block(or block size). +// TODO(typhoonzero): need to benchmark against setting this value +// to 1024. +constexpr int PADDLE_CUDA_NUM_THREADS = 512; + +// For atomicAdd. +USE_CUDA_ATOMIC(Add, float); +USE_CUDA_ATOMIC(Add, int); +USE_CUDA_ATOMIC(Add, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT + +CUDA_ATOMIC_WRAPPER(Add, int64_t) { + // Here, we check long long int must be int64_t. + static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + return CudaAtomicAdd( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT +} + +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +USE_CUDA_ATOMIC(Add, double); +#else +CUDA_ATOMIC_WRAPPER(Add, double) { + unsigned long long int *address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +#ifdef PADDLE_CUDA_FP16 +// NOTE(dzhwinter): cuda do not have atomicCAS for half. +// Just use the half address as a unsigned value address and +// do the atomicCAS. According to the value store at high 16 bits +// or low 16 bits, then do a different sum and CAS. +// Given most warp-threads will failed on the atomicCAS, so this +// implemented should be avoided in high concurrency. It's will be +// slower than the way convert value into 32bits and do a full atomicCAS. + +// convert the value into float and do the add arithmetic. +// then store the result into a uint32. 
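+// A hedged, illustrative walk-through of the packing trick below (the bit
+// patterns are made-up examples, not part of the original code):
+//   uint32_t word = 0xBEEF3C00;          // low 16 bits hold fp16 1.0 (0x3C00)
+//   word = add_to_low_half(word, 2.0f);  // low half becomes fp16 3.0 (0x4200)
+//   // -> word == 0xBEEF4200; the high 16 bits (0xBEEF) are left untouched.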
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, float16) { + // concrete packed float16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" +// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. +template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *tensor, + size_t index, + const size_t numel, + T value) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + CudaAtomicAdd(reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // whether the address is 32-byte aligned. + __half *target_addr = reinterpret_cast<__half *>(tensor + index); + bool aligned_half2 = + (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (aligned_half2 && index < (numel - 1)) { + __half2 value2; + value2.x = *reinterpret_cast<__half *>(&value); + value2.y = __int2half_rz(0); + atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + + } else if (!aligned_half2 && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = *reinterpret_cast<__half *>(&value); + atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + + } else { + atomicAdd(reinterpret_cast<__half *>(tensor) + index, + *reinterpret_cast<__half *>(&value)); + } +#endif +} + +template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *arr, + size_t index, + const size_t numel, + T value) { + CudaAtomicAdd(arr + index, value); +} +#endif + +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. 
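+// The fallback below mirrors the float16 path above: widen the bfloat16 to
+// float, add, and CAS on the aligned 32-bit word that contains it. A hedged,
+// illustrative example (made-up values, not part of the original code):
+//   uint32_t word = 0x12343F80;               // low 16 bits hold bf16 1.0 (0x3F80)
+//   word = bf16_add_to_low_half(word, 2.0f);  // low half becomes bf16 3.0 (0x4040)
+//   // -> word == 0x12344040; the high 16 bits (0x1234) are left untouched.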
+inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +CUDA_ATOMIC_WRAPPER(Add, complex) { + float *real = reinterpret_cast(address); + float *imag = real + 1; + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); +} + +CUDA_ATOMIC_WRAPPER(Add, complex) { + double *real = reinterpret_cast(address); + double *imag = real + 1; + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); +} + +// For atomicMax +USE_CUDA_ATOMIC(Max, int); +USE_CUDA_ATOMIC(Max, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) +USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT +#else +CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT + if (*address >= val) { + return *address; + } + + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + if (assumed >= val) { + break; + } + + old = atomicCAS(address, assumed, val); + } while (assumed != old); +} +#endif + +CUDA_ATOMIC_WRAPPER(Max, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address; // NOLINT + while (val > res) { + long long int old = res; // NOLINT + res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val); // NOLINT + if (res == old) { + break; + } + } + return res; +} + +CUDA_ATOMIC_WRAPPER(Max, float) { + if (*address >= val) { + return *address; + } + + int *const address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_i, assumed, __float_as_int(val)); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Max, double) { + if (*address >= val) { + return *address; + } + + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + if (__longlong_as_double(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { + float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(max(static_cast(low_half), x)); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { + float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(max(static_cast(high_half), x)); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Max, float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +// For atomicMin +USE_CUDA_ATOMIC(Min, int); +USE_CUDA_ATOMIC(Min, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) +USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT +#else +CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT + if (*address <= val) { + return *address; + } + + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + if (assumed <= val) { + break; + } + + old = atomicCAS(address, assumed, val); + } while (assumed != old); +} +#endif + +CUDA_ATOMIC_WRAPPER(Min, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address; // NOLINT + while (val < res) { + long long int old = res; // NOLINT + res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val); // NOLINT + if (res == old) { + break; + } + } + return res; +} + +CUDA_ATOMIC_WRAPPER(Min, float) { + if (*address <= val) { + return *address; + } + + int *const address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) <= val) { + break; + } + + old = atomicCAS(address_as_i, assumed, __float_as_int(val)); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Min, double) { + if (*address <= val) { + return *address; + } + + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + if (__longlong_as_double(assumed) <= val) { + break; + } + + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { + float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(min(static_cast(low_half), x)); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { + float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(min(static_cast(high_half), x)); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Min, float16) { + if (*address <= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +#ifdef PADDLE_CUDA_FP16 +#ifdef PADDLE_WITH_CUDA +/* + * One thead block deals with elementwise atomicAdd for vector of len. + * @in: [x1, x2, x3, ...] + * @out:[y1+x1, y2+x2, y3+x3, ...] + * */ +template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +} + +// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
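+// Typical usage, sketched from how EmbeddingGrad later in this patch calls it
+// (the names `src`, `dst`, and `D` here are illustrative only): one thread
+// block accumulates a length-D row `src` into `dst` with
+//   phi::VectorizedAtomicAddPerBlock(D, threadIdx.x, blockDim.x, src, dst);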
+template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +#else + int i = 0; + int loops = len / 2 * 2; + + bool aligned_half2 = + (reinterpret_cast(out) % sizeof(__half2) == 0); + + if (aligned_half2) { + for (i = tid * 2; i < loops; i += threads_per_block * 2) { + __half2 value2; + T value_1 = in[i]; + T value_2 = in[i + 1]; + value2.x = *reinterpret_cast<__half *>(&value_1); + value2.y = *reinterpret_cast<__half *>(&value_2); + atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + } + for (; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } else { + for (int i = tid; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } +#endif +} +#endif +#endif +} // namespace phi diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index 93232d8f7f4..9eac6e60273 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/gru_compute.h" diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 9b5c24abc67..a8083b13887 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/lstm_compute.h" @@ -202,15 +202,12 @@ __global__ void KeLstmBackward(Op op, if (is_batch) { if (value.prev_state_value) { if (grad.check_ig_grad) - paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, - r_checkIGrad); + phi::CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad); if (grad.check_fg_grad) - paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, - r_checkFGrad); + phi::CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad); } if (grad.check_og_grad) - paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, - r_checkOGrad); + phi::CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad); } else { if (value.prev_state_value) { if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index f660560fceb..ac8487db8f6 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" // TODO(paddle-dev): move gpu_primitives.h to phi -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -217,7 +217,7 @@ __global__ void GatherGradGPUKernel(const T* input, int64_t out_index = inner_dim_index * (outer_dim_size * out_index_dim_size) + index[index_dim_index] * outer_dim_size + out_dim_index; - paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx)); + phi::CudaAtomicAdd(out + out_index, *(input + idx)); } } diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 59d4d21ec46..39cd26a455c 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/primitive/datamover_primitives.h" @@ -428,8 +428,7 @@ __global__ void KernelMaxPool2DGrad(const int nthreads, if (maxIndex != -1) { // atomic add - paddle::platform::CudaAtomicAdd(input_grad + maxIndex, - output_grad[index]); + phi::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); } } } @@ -1330,7 +1329,7 @@ __global__ void KernelMaxPool3DGrad(const int nthreads, } if (maxIdx != -1) { // atomic add - paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + phi::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } @@ -2359,7 +2358,7 @@ __global__ void KernelMaxPool3DWithIdxGrad( w_offset; int max_index = mask[output_index]; if (max_index != -1) { - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &input_grad[nc_offset * input_depth * input_height * input_width + max_index], output_grad[output_index]); diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index d42538edb75..6aeb09b232b 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -70,7 +70,7 @@ __global__ void ScatterCUDAKernel(const T* params, if (overwrite) { *(output + out_i) = *(params + i); } else { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + i)); + phi::CudaAtomicAdd(output + out_i, *(params + i)); } } } @@ -104,7 +104,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, temp *= output_dims[j]; } int64_t output_i = gather_i + slice_i; - paddle::platform::CudaAtomicAdd(output + output_i, *(update + i)); + phi::CudaAtomicAdd(output + output_i, *(update + i)); } } diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 0b608367bbe..99efa783d8e 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/segment_pooling.h" @@ -60,7 +60,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, } if (j > 0) { if (last_segment_id == first_segment_id) { - paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + phi::CudaAtomicAdd(summed_ids + last_segment_id, sum); } else { *(summed_ids + last_segment_id) = sum; } @@ -70,7 +70,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, sum += T(1); last_segment_id = current_segment_id; } - paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + phi::CudaAtomicAdd(summed_ids + last_segment_id, sum); } } @@ -111,8 +111,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, last_segment_id * inner_dim_size + segment_offset; if (last_segment_id == first_segment_id) { - paddle::platform::CudaAtomicAdd( - output + output_index, sum / *(summed_ids + last_segment_id)); + phi::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); } else { *(output + output_index) = sum / *(summed_ids + last_segment_id); } @@ -123,8 +123,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, last_segment_id = current_segment_id; } Index output_index = last_segment_id * inner_dim_size + segment_offset; - paddle::platform::CudaAtomicAdd(output + output_index, - sum / *(summed_ids + last_segment_id)); + phi::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); } } @@ -215,7 +215,7 @@ class MaxPool { DEVICE inline T initial() { return static_cast(-FLT_MAX); } DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicMax(address, val); + return phi::CudaAtomicMax(address, val); } }; @@ -225,7 +225,7 @@ class MinPool { DEVICE inline T initial() { return static_cast(FLT_MAX); } DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? 
*y : x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicMin(address, val); + return phi::CudaAtomicMin(address, val); } }; @@ -235,7 +235,7 @@ class SumPool { DEVICE inline T initial() { return static_cast(0); } DEVICE inline void compute(const T& x, T* y) { *y = *y + x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicAdd(address, val); + return phi::CudaAtomicAdd(address, val); } }; diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu index c21402693b9..e08fea2b353 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cu +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -127,7 +127,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, // Since index in rows of SelectedRows can be duplicate, we can not use // tensor_out[index] += selected_rows[index]; Instead, we have to use // AtomicAdd to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]); } } } // namespace @@ -279,7 +279,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, for (int index = tid; index < row_numel; index += block_size) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]); } } } // namespace @@ -360,7 +360,7 @@ __global__ void MergeAddKernel(const T* input, input += ty * row_numel; out += out_idx * row_numel; for (int index = tid; index < row_numel; index += block_size) { - paddle::platform::CudaAtomicAdd(out + index, input[index]); + phi::CudaAtomicAdd(out + index, input[index]); } } @@ -623,9 +623,9 @@ struct UpdateToTensor { auto* in1_data = in1_value.template data(); auto* in2_data = input2->data(); - dim3 threads(paddle::platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 threads(phi::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(in1_rows.size(), 1); - UpdateToTensorKernel + UpdateToTensorKernel <<>>( in1_data, in1_rows.cuda_data(), op, in2_data, in1_row_numel); } diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index 771b189f040..ef3e5b9af24 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -17,14 +17,14 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AccuracyCudaKernel(const int N, diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu index 150b18bdbd6..53169c8bcfb 100644 --- a/paddle/phi/kernels/gpu/adagrad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/adagrad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/adagrad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" @@ -47,7 +47,7 @@ __global__ void MergeGradKernel(const T* grad, grad += ty * row_numel; grad_merge += grad_merge_idx * row_numel; for (int index = tid; index < row_numel; index += block_size) { - paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + phi::CudaAtomicAdd(grad_merge + index, grad[index]); } } @@ -69,9 +69,9 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, for (int index = tid; index < row_numel; index += block_size) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(param + index, - -1.0 * learning_rate[0] * grad[index] / - (sqrt(moment[index]) + epsilon)); + phi::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); } } diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index 5cfa8cf3067..a7a82236a40 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/affine_grid_utils.h" @@ -75,18 +75,14 @@ __global__ void affine_grid_grad_kernel_4d(const int count, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset, - out_grad_x * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1, - out_grad_x * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); + phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3, - out_grad_y * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4, - out_grad_y * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); + phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } @@ -116,31 +112,22 @@ __global__ void affine_grid_grad_kernel_5d(const int count, int theta_offset = n * 12; // 3 * 4; T out_grad_x = out_grad[index * 3]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset, - out_grad_x * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1, - out_grad_x * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2, - out_grad_x 
* d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x); + phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x); T out_grad_y = out_grad[index * 3 + 1]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4, - out_grad_y * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5, - out_grad_y * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 6, - out_grad_y * d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y); + phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 6, out_grad_y * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y); T out_grad_z = out_grad[index * 3 + 2]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 8, - out_grad_z * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 9, - out_grad_z * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 10, - out_grad_z * d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z); + phi::CudaAtomicAdd(theta_grad + theta_offset + 8, out_grad_z * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 9, out_grad_z * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 10, out_grad_z * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z); } } diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index 0f42960502b..499ed260eef 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/affine_grid_utils.h" diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu index 44c0df3b522..c815f33a667 100644 --- a/paddle/phi/kernels/gpu/auc_kernel.cu +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/auc_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, @@ -74,9 +74,9 @@ __global__ void AddDataKernel(const int64_t *label_data, "The predict data must gather or equal 0."); uint32_t binIdx = static_cast(predict_data * num_thresholds); if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); + phi::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); + phi::CudaAtomicAdd(neg + 
cur_step_begin + binIdx, 1); } } } diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index 3b1e41d92e6..162df476982 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -14,15 +14,15 @@ #include "paddle/phi/kernels/bincount_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -36,12 +36,11 @@ __global__ void KernelBincount(const InputT* input, OutT* output) { if (!has_weights) { for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + phi::CudaAtomicAdd(&output[input[i]], 1L); } } else { for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); + phi::CudaAtomicAdd(&output[input[i]], static_cast(weights[i])); } } } diff --git a/paddle/phi/kernels/gpu/box_coder.cu b/paddle/phi/kernels/gpu/box_coder.cu index bca18c25c3e..6dd53c04d29 100644 --- a/paddle/phi/kernels/gpu/box_coder.cu +++ b/paddle/phi/kernels/gpu/box_coder.cu @@ -18,8 +18,8 @@ #include #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/box_coder.h" diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b46f1f4a331..55c8a9f96fd 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/deformable_conv_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h" @@ -107,8 +107,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel( height, width); - paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, - weight * cur_top_grad); + phi::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + weight * cur_top_grad); } } } diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 21d0faf0b0d..5da0ae96e6b 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -981,7 +981,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( } #undef gaid } - platform::CudaAtomicAdd(&filter_grad_data[gbid], s); + phi::CudaAtomicAdd(&filter_grad_data[gbid], s); } } @@ -1057,7 +1057,7 @@ 
__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } for (int i = 0; i < c_filter * c_filter; ++i) { T* weight = filter_grad_data + i * output_channels + kernel_id; - platform::CudaAtomicAdd(&weight[0], r_weight[i]); + phi::CudaAtomicAdd(&weight[0], r_weight[i]); } } } diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 1fd1e446991..05a57426fcb 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -15,13 +15,13 @@ #include "paddle/phi/kernels/diagonal_grad_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 169cb3f2c78..74bad0ecd9a 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -15,12 +15,12 @@ #include "paddle/phi/kernels/diagonal_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index 130dc99ab17..bcce09649a8 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -32,7 +32,7 @@ namespace cub = hipcub; #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace phi { @@ -69,7 +69,7 @@ __global__ void GPUDistFpnProposalsHelper(const int nthreads, tgt_lvl = min(max_level, max(tgt_lvl, min_level)); target_lvls[i] = tgt_lvl; // compute number of rois in the same batch and same target level - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1); } } diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index 993b4771cc9..d63430d5279 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -18,14 +18,14 @@ #include #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillFirstRow(T* dist, const int N) { diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu 
b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index e10d01ce9e4..0cfe2e43d18 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -16,8 +16,8 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -51,10 +51,10 @@ __global__ void EmbeddingGrad(T* table, const T* out = output + idy * D; T* tab = table + id * D; #ifdef PADDLE_WITH_CUDA - paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); + phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); #else for (int i = idx; i < D; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } #endif idy += blockDim.y * gridDim.x; diff --git a/paddle/phi/kernels/gpu/graph_reindex_funcs.h b/paddle/phi/kernels/gpu/graph_reindex_funcs.h index 0a6d6a549a7..aee6e5c4d46 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_funcs.h +++ b/paddle/phi/kernels/gpu/graph_reindex_funcs.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/graph_reindex_kernel.h" diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 180a9dfac85..9aacba8a7a3 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -19,8 +19,8 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/send_u_recv_kernel.h" @@ -32,7 +32,7 @@ struct GraphSendRecvSumCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + phi::CudaAtomicAdd(output + out_i, *(params + in_i)); } }; @@ -42,7 +42,7 @@ struct GraphSendRecvMaxCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + phi::CudaAtomicMax(output + out_i, *(params + in_i)); } }; @@ -52,7 +52,7 @@ struct GraphSendRecvMinCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + phi::CudaAtomicMin(output + out_i, *(params + in_i)); } }; @@ -106,7 +106,7 @@ __global__ void ComputeCountCUDAKernel(int32_t* count, size_t index_size) { CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); + phi::CudaAtomicAdd(count + dst_i, 1); } } @@ -140,8 +140,8 @@ __global__ void ManipulateMeanGradCUDAKernel(const T* params, IndexT dst_i = dst_indices[indices_i]; int64_t in_i = src_i * slice_size + slice_i; int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, *(params + in_i) / static_cast(dst_count[src_i])); + phi::CudaAtomicAdd(output + out_i, + *(params + in_i) / static_cast(dst_count[src_i])); } } @@ 
-162,10 +162,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel(const T* params, IndexT dst_i = dst_indices[indices_i]; int64_t in_i = src_i * slice_size + slice_i; int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * - static_cast(*(ptr_input + out_i) == *(ptr_output + in_i))); + phi::CudaAtomicAdd(output + out_i, + *(params + in_i) * static_cast(*(ptr_input + out_i) == + *(ptr_output + in_i))); } } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 1bc841a6d8b..bff91078865 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -17,8 +17,8 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/impl/graph_message_passing_impl.h" @@ -29,25 +29,25 @@ namespace phi { #define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF inline void CopyBCastOff(const BroadCastInfo& bcast_info, - thrust::device_vector& l_bcastoff, - thrust::device_vector& r_bcastoff) { - l_bcastoff.resize(bcast_info.out_len); - r_bcastoff.resize(bcast_info.out_len); + thrust::device_vector* l_bcastoff, + thrust::device_vector* r_bcastoff) { + l_bcastoff->resize(bcast_info.out_len); + r_bcastoff->resize(bcast_info.out_len); #ifdef PADDLE_WITH_HIP - hipMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + hipMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); - hipMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + hipMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); #else - cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), sizeof(int64_t) * bcast_info.out_len, cudaMemcpyHostToDevice); - cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, cudaMemcpyHostToDevice); @@ -102,21 +102,21 @@ inline int FindNumBlocks(char axis, int nblocks, int max_num_blocks = -1) { template struct GraphSendUERecvSumCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicAdd(output, val); + phi::CudaAtomicAdd(output, val); } }; template struct GraphSendUERecvMaxCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicMax(output, val); + phi::CudaAtomicMax(output, val); } }; template struct GraphSendUERecvMinCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicMin(output, val); + phi::CudaAtomicMin(output, val); } }; @@ -192,8 +192,7 @@ __global__ void ManipulateMeanGradCUDAKernelForMulX(const T* out_grad_data, int64_t o_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? 
r_bcastoff[tx] : tx; T val = out_grad_off[o_add] * e_off[e_add]; - paddle::platform::CudaAtomicAdd(x_grad_off + tx, - val / static_cast(dst_count[src])); + phi::CudaAtomicAdd(x_grad_off + tx, val / static_cast(dst_count[src])); tx += stride_x; } ty += stride_y; @@ -222,7 +221,7 @@ __global__ void ManipulateSumGradCUDAKernelForAddE(const T* out_grad_data, const T* out_grad_off = out_grad_data + dst * out_len; while (tx < out_len) { int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx]); + phi::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx]); tx += stride_x; } ty += stride_y; @@ -258,8 +257,7 @@ __global__ void ManipulateSumGradCUDAKernelForMulE(const T* x_data, while (tx < out_len) { int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, - out_grad_off[tx] * x_off[x_add]); + phi::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx] * x_off[x_add]); tx += stride_x; } ty += stride_y; @@ -289,9 +287,8 @@ __global__ void ManipulateMeanGradCUDAKernelForAddE(const T* out_grad_data, const T* out_grad_off = out_grad_data + dst * out_len; while (tx < out_len) { int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd( - e_grad_off + e_add, - out_grad_off[tx] / static_cast(dst_count[dst])); + phi::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] / static_cast(dst_count[dst])); tx += stride_x; } ty += stride_y; @@ -328,7 +325,7 @@ __global__ void ManipulateMeanGradCUDAKernelForMulE(const T* x_data, while (tx < out_len) { int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( e_grad_off + e_add, out_grad_off[tx] * x_off[x_add] / static_cast(dst_count[dst])); tx += stride_x; @@ -373,12 +370,10 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? ebcast_off[tx] : tx; T val = x_off[x_add] + e_off[e_add]; - paddle::platform::CudaAtomicAdd( - x_grad_off + x_add, - out_grad_off[tx] * static_cast(val == out_off[tx])); - paddle::platform::CudaAtomicAdd( - e_grad_off + e_add, - out_grad_off[tx] * static_cast(val == out_off[tx])); + phi::CudaAtomicAdd(x_grad_off + x_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); + phi::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); tx += stride_x; } ty += stride_y; @@ -421,10 +416,10 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? 
ebcast_off[tx] : tx; T val = x_off[x_add] * e_off[e_add]; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( x_grad_off + x_add, out_grad_off[tx] * static_cast(val == out_off[tx]) * e_off[e_add]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( e_grad_off + e_add, out_grad_off[tx] * static_cast(val == out_off[tx]) * x_off[x_add]); tx += stride_x; diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu index 436faeaabf9..8f4beaa2677 100644 --- a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/grid_sample_grad_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/grid_sample_utils.h" @@ -28,7 +28,7 @@ template static __forceinline__ __device__ void AtomicAdd( T* data, int h, int w, int sH, int sW, int H, int W, T delta) { if (InBounds(h, w, H, W)) { - paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); } } @@ -45,7 +45,7 @@ static __forceinline__ __device__ void AtomicAdd3D(T* data, int W, T delta) { if (InBounds3D(d, h, w, D, H, W)) { - paddle::platform::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); } } diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index cbe8c18cbff..54006cd8d5a 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -71,14 +71,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - paddle::platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - paddle::platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h index 6af7b96ca21..00986817c61 100644 --- a/paddle/phi/kernels/gpu/group_norm_utils.h +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -23,7 +23,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace phi { @@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { typedef cub::WarpReduce WarpReduce; typename WarpReduce::TempStorage temp_storage; value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) paddle::platform::CudaAtomicAdd(sum, value); + if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); } template diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 4cc6bc35578..02f5bbb530a 100644 --- 
a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/histogram_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -25,7 +25,7 @@ namespace phi { using IndexType = int64_t; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -61,13 +61,13 @@ __global__ void KernelHistogram(const T* input, if (input_value >= min_value && input_value <= max_value) { const IndexType output_index = GetBin(input_value, min_value, max_value, nbins); - paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); + phi::CudaAtomicAdd(&buf_hist[output_index], 1); } } __syncthreads(); for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); + phi::CudaAtomicAdd(&output[i], buf_hist[i]); } } diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index ddc8a65ad51..88e42a16dba 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_add_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -24,7 +24,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void IndexAddGradKernel(const Context& ctx, diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 047e54b99aa..215b28085a9 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_add_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" @@ -24,7 +24,7 @@ DECLARE_bool(cudnn_deterministic); namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_add_cuda_kernel(const T* input, @@ -41,7 +41,7 @@ __global__ void index_add_cuda_kernel(const T* input, IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&output[input_idx], add_value[idx]); + phi::CudaAtomicAdd(&output[input_idx], add_value[idx]); } } diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index 5368d98c56a..6c94e14492b 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -18,9 +18,9 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -50,8 +50,8 @@ __global__ void IndexSampleGrad(const IndexT* index, unsigned int in_idx = index_j * input_length + index_i; IndexT sample_idx = index[index_idx]; if (same_data_in_row) { - paddle::platform::CudaAtomicAdd( - &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + phi::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), + out_grad[sample_idx]); } else { in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; } diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index fb9157db557..1cd998810ee 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -25,7 +25,7 @@ DECLARE_bool(cudnn_deterministic); namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_grad_cuda_kernel(const T* output_grad, @@ -42,7 +42,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); + phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); } } diff --git a/paddle/phi/kernels/gpu/index_select_impl.h b/paddle/phi/kernels/gpu/index_select_impl.h index da9cdbf5278..deeb6e5eb20 100644 --- a/paddle/phi/kernels/gpu/index_select_impl.h +++ b/paddle/phi/kernels/gpu/index_select_impl.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_cuda_kernel(const T* input, diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 135ae526518..925feee4ccf 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -14,16 +14,16 @@ #include "paddle/phi/kernels/index_select_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" 
#include "paddle/phi/kernels/gpu/index_select_impl.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void IndexSelectKernel(const Context& ctx, diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index ee24d9f9005..51a5f50560e 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/interpolate_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -96,12 +96,11 @@ __global__ void KeLinearInterpBw(T* in, const T* out_pos = &out[out_id_w]; if (data_layout == DataLayout::kNCHW) { - paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); } else { - paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]); } } } @@ -141,7 +140,7 @@ __global__ void KeNearestNeighborInterpNCHWBw(T* in, while (nc_id < nc) { T* in_pos = &in[in_index]; const T out_pos = out[out_index]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); in_index += in_index_stride; out_index += out_index_stride; nc_id += nc_stride; @@ -194,7 +193,7 @@ __global__ void KeNearestNeighborInterpBw( in_img_idx * num_channels + channel_id]; const T out_pos = out[tid]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } @@ -218,7 +217,7 @@ __inline__ __device__ T PartialBlockMin(T val, } } else { shared_last_val = std::numeric_limits::max(); - paddle::platform::CudaAtomicMin(&shared_last_val, val); + phi::CudaAtomicMin(&shared_last_val, val); shared[wid] = shared_last_val; shared_last_idx = wid; } @@ -308,33 +307,27 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, ? 
(in_top_max_index - in_top_min_index) : (in_bot_max_index - in_bot_min_index); if (h_id != 0) { - paddle::platform::CudaAtomicAdd( - &s_data[0][input_index - in_top_min_index], - h2lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[0][top_right_index - in_top_min_index], - h2lambda * w1lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_left_index - in_bot_min_index], - h1lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_right_index - in_bot_min_index], - h1lambda * w1lambda * value); + phi::CudaAtomicAdd(&s_data[0][input_index - in_top_min_index], + h2lambda * w2lambda * value); + phi::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + h2lambda * w1lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + h1lambda * w2lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_right_index - in_bot_min_index], + h1lambda * w1lambda * value); } else { - paddle::platform::CudaAtomicAdd( - &s_data[0][top_right_index - in_top_min_index], - (h2lambda + h1lambda) * w1lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_left_index - in_bot_min_index], - (h1lambda + h2lambda) * w2lambda * value); + phi::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + (h2lambda + h1lambda) * w1lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + (h1lambda + h2lambda) * w2lambda * value); } __syncthreads(); if (threadIdx.x <= upper_limit_share_idx) { - paddle::platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], - s_data[0][threadIdx.x]); - paddle::platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], - s_data[1][threadIdx.x]); + phi::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], + s_data[0][threadIdx.x]); + phi::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], + s_data[1][threadIdx.x]); } } } @@ -387,17 +380,14 @@ __global__ void KeBilinearInterpNCHWBw(T* in, T d2val = out[index]; - paddle::platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), - h0lambda * w0lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), - h0lambda * w1lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), - h1lambda * w0lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), - h1lambda * w1lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); } } @@ -446,12 +436,12 @@ __global__ void KeBilinearInterpBw(T* in, T value = out[tid]; T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + in_img_idx * num_channels + channel_id]; - paddle::platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - paddle::platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + 
phi::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + phi::CudaAtomicAdd( &in_pos[h_id * in_w * num_channels + w_id * num_channels], h1lambda * w1lambda * value); } @@ -530,8 +520,8 @@ __global__ void KeBicubicInterpBw(T* in, in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + access_x * num_channels + channel_id]; } - paddle::platform::CudaAtomicAdd( - &in_pos[0], (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + phi::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); } } } @@ -629,26 +619,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - paddle::platform::CudaAtomicAdd( - &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[w_id], d2lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[w_id], d1lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); } else { int in_pos1_idx = out_id_h * input_w + in_img_idt * in_img_h * in_img_w * num_channels + @@ -661,26 +647,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - paddle::platform::CudaAtomicAdd( - &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[w_id * num_channels], - d2lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], d2lambda * h1lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - 
paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], d1lambda * h1lambda * w1lambda * out_pos[0]); } @@ -751,7 +733,7 @@ __global__ void KeNearestNeighbor3DInterpBw(T* in, in_img_idx * num_channels + channel_id]; } const T out_pos = out[out_id_h * output_w + out_id_w]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index c8f6a40104b..625718e8f4b 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/interpolate_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index 9db11381cbc..eaf2955010c 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/linspace_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu index 30fd93533ed..99b1c1a8c0a 100644 --- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/nanmedian_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -24,7 +24,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index 132b9fa10b7..2765f96321c 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/nanmedian_kernel.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include 
"paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" @@ -25,7 +25,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -56,15 +56,15 @@ __global__ void KernelNanCounts(const T* input, const T x = input[index]; if (isnan(static_cast(x))) { auto bin = static_cast(index / stride); - paddle::platform::CudaAtomicAdd(&buf[bin], 1); + phi::CudaAtomicAdd(&buf[bin], 1); } } __syncthreads(); for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&nan_counts[i], buf[i]); - paddle::platform::CudaAtomicAdd(&nan_total[0], buf[i]); - paddle::platform::CudaAtomicMax(&nan_total[1], stride - buf[i]); + phi::CudaAtomicAdd(&nan_counts[i], buf[i]); + phi::CudaAtomicAdd(&nan_total[0], buf[i]); + phi::CudaAtomicMax(&nan_total[1], stride - buf[i]); } } diff --git a/paddle/phi/kernels/gpu/nll_loss.h b/paddle/phi/kernels/gpu/nll_loss.h index bb47a2f06f4..37a67b4767a 100644 --- a/paddle/phi/kernels/gpu/nll_loss.h +++ b/paddle/phi/kernels/gpu/nll_loss.h @@ -20,7 +20,7 @@ #include #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" @@ -270,8 +270,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(T* out_data, partial_sums, blockDim.x, acc_weight, thrust::plus(), (T)0); if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(total_weight_data, acc_weight); - paddle::platform::CudaAtomicAdd(out_data, input_sum); + phi::CudaAtomicAdd(total_weight_data, acc_weight); + phi::CudaAtomicAdd(out_data, input_sum); } } diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index dcc6d6e2b45..79b0b8dfb18 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -16,8 +16,8 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index abe7757df72..d055e5ad73e 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -15,14 +15,14 @@ #include "paddle/phi/kernels/one_hot_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillOutputKernel(const InT* p_in_data, diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index fb7f1a23257..ca26d9be4f9 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -15,14 +15,14 @@ #include "paddle/phi/kernels/pad3d_grad_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad3DGradConstNCDHW(const int in_size, @@ -133,7 +133,7 @@ __global__ void Pad3DGradReflectNCDHW(const int out_size, in_h = min(in_h, 2 * in_height - in_h - 2); in_w = min(in_w, 2 * in_width - in_w - 2); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -176,7 +176,7 @@ __global__ void Pad3DGradReflectNDHWC(const int out_size, in_d = min(in_d, in_depth * 2 - in_d - 2); in_h = min(in_h, in_height * 2 - in_h - 2); in_w = min(in_w, in_width * 2 - in_w - 2); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], @@ -211,7 +211,7 @@ __global__ void Pad3DGradReplicateNCDHW(const int out_size, const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -247,7 +247,7 @@ __global__ void Pad3DGradReplicateNDHWC(const int out_size, const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], @@ -282,7 +282,7 @@ __global__ void Pad3DGradCircularNCDHW(const int out_size, int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -318,7 +318,7 @@ __global__ void Pad3DGradCircularNDHWC(const int out_size, int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index fa85c650bc8..241ffefe5d1 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -17,14 +17,14 @@ #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" namespace 
phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad3DConstNCDHW(const int nthreads, diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 6ecaaef1870..433bb5e4cf1 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -16,7 +16,7 @@ #include #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -97,7 +97,7 @@ __global__ void GPUPSROIPoolBackward(const int nthreads, for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { int input_index = ih * width + iw; - paddle::platform::CudaAtomicAdd(offset_dx_data + input_index, diff_val); + phi::CudaAtomicAdd(offset_dx_data + input_index, diff_val); } } } diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index 0673eda8d5f..09dce366bea 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/roi_align_grad_kernel.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -153,14 +153,11 @@ __global__ void GPURoiAlignBackward(const int nthreads, T diff3 = out_grad_this_bin * w3 / count; T diff4 = out_grad_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_low * width + x_low, diff1); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_low * width + x_high, diff2); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_high * width + x_low, diff3); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_high * width + x_high, diff4); + phi::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, diff1); + phi::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, diff2); + phi::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, diff3); + phi::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, + diff4); } } } diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index 6b888b200e1..527060ae681 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/roi_pool_grad_kernel.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -63,7 +63,7 @@ __global__ void GPURoiPoolBackward(const int nthreads, int arg_max = offset_arg_max_data[ph * pooled_width + pw]; if (arg_max != -1) { - 
paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( offset_input_grad + arg_max, static_cast(offset_output_grad[ph * pooled_width + pw])); } diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 82e0fa72ab0..ff3ac29117e 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -20,7 +20,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void RollGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 5d3584e4f44..60db02d8463 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -21,7 +21,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void RollKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index 823164f3fbc..99ef7515e2c 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void RollCudaKernel(const T* input, diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index 281bf6278af..14ec4462fc5 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -45,7 +45,7 @@ void CalculateXEGradForMinMax(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; @@ -177,7 +177,7 @@ void CalculateXGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); @@ -300,7 +300,7 @@ void CalculateXGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); @@ -386,7 +386,7 @@ void CalculateEGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu 
index 482077b7f93..aaae915f9df 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -89,7 +89,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::device_vector x_bcastoff, e_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, x_bcastoff, e_bcastoff); + CopyBCastOff(bcast_info, &x_bcastoff, &e_bcastoff); } int64_t out_len = bcast_info.out_len; diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index ad904c8ae2d..e33d259bd44 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -45,7 +45,7 @@ __global__ void GraphSendUVGradCUDAKernel(const T* out_grad, const T* out_grad_off = out_grad + ty * slice_size; T* x_grad_off = x_grad + dst * slice_size; while (tx < slice_size) { - paddle::platform::CudaAtomicAdd(x_grad_off + tx, out_grad_off[tx]); + phi::CudaAtomicAdd(x_grad_off + tx, out_grad_off[tx]); tx += stride_x; } ty += stride_y; @@ -127,7 +127,7 @@ void CalculateGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(y.dims(), out_grad_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); diff --git a/paddle/phi/kernels/gpu/send_uv_kernel.cu b/paddle/phi/kernels/gpu/send_uv_kernel.cu index 69c1515e812..860a900dac8 100644 --- a/paddle/phi/kernels/gpu/send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_kernel.cu @@ -94,7 +94,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, thrust::device_vector x_bcastoff, y_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, x_bcastoff, y_bcastoff); + CopyBCastOff(bcast_info, &x_bcastoff, &y_bcastoff); } int64_t out_len = bcast_info.out_len; diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index b2b73b2812c..ea257ebd1cc 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -16,9 +16,9 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. 
- paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( tensor_out_ptr + index, -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu index 96fd3911c0d..f8e51eb98d7 100644 --- a/paddle/phi/kernels/gpu/shard_index_kernel.cu +++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/shard_index_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void ShardIndexInner(const T* in_data, diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 9b895adb0a3..a970902d800 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index 92797833918..8a88383e6e4 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -14,14 +14,14 @@ #include "paddle/phi/kernels/trunc_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void TruncGrad(T* dx, int64_t N) { diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index 1f374714a6f..dfc4f6589e9 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -14,14 +14,14 @@ #include "paddle/phi/kernels/trunc_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template class TruncFunctor { diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 13c93c9d3fe..4bc8c205025 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include 
"paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu index 7a1cb6d53af..98f200480d4 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h index ddaaebafbc9..5e900285273 100644 --- a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h @@ -18,7 +18,7 @@ #include "paddle/phi/kernels/cpu/index_select_impl.h" #include "paddle/phi/kernels/repeat_interleave_grad_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -33,7 +33,7 @@ namespace cub = hipcub; namespace phi { #if defined(__NVCC__) || defined(__HIPCC__) -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_grad_cuda_kernel(const T* output_grad, @@ -53,7 +53,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); + phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); } template diff --git a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h index dd950a14f67..8548785aaca 100644 --- a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h +++ b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h @@ -18,9 +18,9 @@ #include "paddle/phi/kernels/cpu/index_select_impl.h" #include "paddle/phi/kernels/repeat_interleave_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #endif @@ -30,7 +30,7 @@ namespace phi { #if defined(__NVCC__) || defined(__HIPCC__) -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_cuda_kernel(const T* input, T* output, @@ -81,9 +81,8 @@ void RepeatInterleaveKernel(const Context& ctx, output_dim[dim] = index_size; out->Resize(phi::make_ddim(output_dim)); phi::IndexSelectInner(ctx, &x_copy, index, out, dim); - } #if defined(__NVCC__) || 
defined(__HIPCC__) - else { + } else { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; paddle::framework::TensorFromVector(index_vec, ctx, &index); @@ -105,6 +104,8 @@ void RepeatInterleaveKernel(const Context& ctx, stream>>>( x.data(), out_data, index_data, numel, stride, size, delta); } +#else + } #endif } @@ -163,9 +164,8 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx, out->Resize(phi::make_ddim(output_dim)); IndexSelectInner(ctx, &x_copy, index, out, dim); } - } #if defined(__NVCC__) || defined(__HIPCC__) - else { + } else { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; auto stream = ctx.stream(); @@ -209,6 +209,8 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx, in_data, out_data, index_data, numel, stride, size, delta); } } +#else + } #endif } -- GitLab
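A minimal sketch of the call-site migration this patch applies to every GPU kernel above: the only change is the include path and the namespace of CudaAtomicAdd / CudaAtomicMax / CudaAtomicMin and of PADDLE_CUDA_NUM_THREADS, which now live in paddle/phi/backends/gpu/gpu_primitives.h. The kernel and launcher below are hypothetical names used only for illustration (they are not part of this patch) and assume the signatures declared in that header.

// Hypothetical example, not taken from this patch; it only illustrates the
// include/namespace change made throughout the files above.
//
// Before: #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
//         using paddle::platform::PADDLE_CUDA_NUM_THREADS;
//         paddle::platform::CudaAtomicAdd(addr, val);
//
// After:
#include "paddle/phi/backends/gpu/gpu_primitives.h"

template <typename T, typename IndexT>
__global__ void ExampleScatterAddKernel(const T* src,
                                        const IndexT* index,
                                        T* dst,
                                        int64_t n) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Same atomic add as before the patch, now reached through phi::.
    phi::CudaAtomicAdd(&dst[index[i]], src[i]);
  }
}

template <typename T, typename IndexT>
void LaunchExampleScatterAdd(
    const T* src, const IndexT* index, T* dst, int64_t n, cudaStream_t stream) {
  // The default block size (512) is also taken from phi now.
  int64_t blocks =
      (n + phi::PADDLE_CUDA_NUM_THREADS - 1) / phi::PADDLE_CUDA_NUM_THREADS;
  ExampleScatterAddKernel<T, IndexT>
      <<<blocks, phi::PADDLE_CUDA_NUM_THREADS, 0, stream>>>(src, index, dst, n);
}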