From dc7597e3775a8a08b235e184bd780bcb1a1eae9d Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 21 Dec 2021 05:26:53 -0600
Subject: [PATCH] [PTen] Rename cuda dir and context to gpu (#38296)

* rename cuda to gpu

* revert CMake change

* resolve conflict

* rename other cuda to gpu

* polish details
---
 paddle/fluid/operators/conj_op.h              |  2 +-
 paddle/pten/CMakeLists.txt                    |  2 +-
 paddle/pten/api/include/tensor.h              |  2 +-
 paddle/pten/api/lib/ext_compat_utils.cc       |  2 +-
 paddle/pten/api/lib/kernel_declare.h          | 12 ++++----
 paddle/pten/api/lib/utils.cc                  |  2 +-
 paddle/pten/backends/all_context.h            |  2 +-
 .../cuda_context.h => gpu/gpu_context.h}      |  2 +-
 paddle/pten/common/backend.h                  |  6 ++--
 paddle/pten/core/convert_utils.cc             |  4 +--
 paddle/pten/core/kernel_registry.h            |  2 +-
 paddle/pten/core/kernel_utils.h               |  2 +-
 paddle/pten/include/linalg.h                  |  2 +-
 paddle/pten/include/manipulation.h            |  2 +-
 paddle/pten/include/math.h                    |  4 +--
 paddle/pten/kernels/CMakeLists.txt            |  3 +-
 paddle/pten/kernels/cuda/CMakeLists.txt       | 17 -----------
 paddle/pten/kernels/gpu/CMakeLists.txt        | 17 +++++++++++
 .../pten/kernels/{cuda => gpu}/conj_kernel.cu | 10 +++----
 .../pten/kernels/{cuda => gpu}/conj_kernel.h  |  4 +--
 .../pten/kernels/{cuda => gpu}/full_kernel.cu |  6 ++--
 paddle/pten/kernels/{cuda => gpu}/linalg.cu   | 14 ++++-----
 paddle/pten/kernels/{cuda => gpu}/linalg.h    |  6 ++--
 .../kernels/{cuda => gpu}/manipulation.cu     | 24 +++++++--------
 .../pten/kernels/{cuda => gpu}/manipulation.h | 10 +++----
 paddle/pten/kernels/{cuda => gpu}/math.cu     | 25 ++++++++--------
 paddle/pten/kernels/{cuda => gpu}/math.h      | 18 +++++------
 .../kernels/{cuda => gpu}/scale_kernel.cu     |  4 +--
 paddle/pten/kernels/{cuda => gpu}/utils.cu    |  6 ++--
 paddle/pten/kernels/{cuda => gpu}/utils.h     |  4 +--
 paddle/pten/kernels/hybird/CMakeLists.txt     |  4 +--
 .../kernels/hybird/cuda/cast_kernel_impl.h    |  4 +--
 .../pten/kernels/hybird/cuda/reduce/reduce.h  |  4 +--
 .../hybird/cuda/reduce/reduce_cuda_impl.h     |  4 +--
 .../kernels/hybird/general/elementwise_base.h | 12 ++++----
 paddle/pten/kernels/hybird/transpose.cu       |  8 ++---
 paddle/pten/tests/api/scale_api.h             | 30 +++++++++----------
 paddle/pten/tests/api/test_matmul_api.cc      |  2 +-
 paddle/pten/tests/api/test_tensor_utils.cc    |  2 +-
 paddle/pten/tests/api/test_to_api.cc          |  4 +--
 paddle/pten/tests/common/test_backend.cc      |  4 +--
 41 files changed, 146 insertions(+), 148 deletions(-)
 rename paddle/pten/backends/{cuda/cuda_context.h => gpu/gpu_context.h} (93%)
 delete mode 100644 paddle/pten/kernels/cuda/CMakeLists.txt
 create mode 100644 paddle/pten/kernels/gpu/CMakeLists.txt
 rename paddle/pten/kernels/{cuda => gpu}/conj_kernel.cu (81%)
 rename paddle/pten/kernels/{cuda => gpu}/conj_kernel.h (86%)
 rename paddle/pten/kernels/{cuda => gpu}/full_kernel.cu (93%)
 rename paddle/pten/kernels/{cuda => gpu}/linalg.cu (90%)
 rename paddle/pten/kernels/{cuda => gpu}/linalg.h (89%)
 rename paddle/pten/kernels/{cuda => gpu}/manipulation.cu (88%)
 rename paddle/pten/kernels/{cuda => gpu}/manipulation.h (86%)
 rename paddle/pten/kernels/{cuda => gpu}/math.cu (89%)
 rename paddle/pten/kernels/{cuda => gpu}/math.h (86%)
 rename paddle/pten/kernels/{cuda => gpu}/scale_kernel.cu (93%)
 rename paddle/pten/kernels/{cuda => gpu}/utils.cu (98%)
 rename paddle/pten/kernels/{cuda => gpu}/utils.h (91%)

diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h
index 90724403d4..0b5a35f515 100644
--- a/paddle/fluid/operators/conj_op.h
+++ b/paddle/fluid/operators/conj_op.h
@@ -21,7 +21,7 @@
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
 #include "paddle/pten/kernels/cpu/conj_kernel.h"
-#include "paddle/pten/kernels/cuda/conj_kernel.h"
+#include "paddle/pten/kernels/gpu/conj_kernel.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index eb9a149dd6..799ec885b9 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -27,7 +27,7 @@ set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_cont
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
-  set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda manipulation_cuda conj_kernel_cuda scale_kernel_cuda full_kernel_cuda)
+  set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu scale_kernel_gpu full_kernel_gpu)
 endif()
 if(WITH_XPU)
   set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h
index c8ef22c2ec..935c7d8e32 100644
--- a/paddle/pten/api/include/tensor.h
+++ b/paddle/pten/api/include/tensor.h
@@ -464,7 +464,7 @@ class PADDLE_API Tensor final {
    * unified to Tensor, but Tensor itself is heterogeneous.
    *
    * Tensor can generally be represented by void* and size_t, place.
-   * This is suitable for most scenarios including CPU, CUDA, HIP, CPU, etc.,
+   * This is suitable for most scenarios including CPU, GPU, HIP, CPU, etc.,
    * but there are a few cases where this definition cannot be described,
    * such as the Tensor representation in third-party lib such as Metal,
    * OpenCL, etc., as well as some special Tensor implementations, including
diff --git a/paddle/pten/api/lib/ext_compat_utils.cc b/paddle/pten/api/lib/ext_compat_utils.cc
index 791a8526f3..6f1763eac3 100644
--- a/paddle/pten/api/lib/ext_compat_utils.cc
+++ b/paddle/pten/api/lib/ext_compat_utils.cc
@@ -56,7 +56,7 @@ Backend ConvertExtPlaceToBackend(PlaceType p) {
       return Backend::CPU;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case PlaceType::kGPU:
-      return Backend::CUDA;
+      return Backend::GPU;
 #endif
     default:
       PADDLE_THROW(
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index e748a51082..4dbd46bff6 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -28,12 +28,12 @@ PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(full_like, CUDA, ALL_LAYOUT);
-PT_DECLARE_KERNEL(dot, CUDA, ALL_LAYOUT);
-PT_DECLARE_KERNEL(flatten, CUDA, ALL_LAYOUT);
-PT_DECLARE_KERNEL(sign, CUDA, ALL_LAYOUT);
-PT_DECLARE_KERNEL(scale, CUDA, ALL_LAYOUT);
-PT_DECLARE_KERNEL(conj, CUDA, ALL_LAYOUT);
+PT_DECLARE_KERNEL(full_like, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(flatten, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT);
 #endif
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc
index 06d604cb83..ddb29c8833 100644
--- a/paddle/pten/api/lib/utils.cc
+++ b/paddle/pten/api/lib/utils.cc
@@ -28,7 +28,7 @@ limitations under the License. */
 PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(copy, CUDA, ALL_LAYOUT);
+PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
 #endif
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/pten/backends/all_context.h b/paddle/pten/backends/all_context.h
index d056af1b3a..a7cb4abc2f 100644
--- a/paddle/pten/backends/all_context.h
+++ b/paddle/pten/backends/all_context.h
@@ -21,7 +21,7 @@ limitations under the License. */
 
 // path replacement after implementing pten DeviceContext
 #include "paddle/pten/backends/cpu/cpu_context.h"
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/backends/npu/npu_context.h"
 #include "paddle/pten/backends/xpu/xpu_context.h"
diff --git a/paddle/pten/backends/cuda/cuda_context.h b/paddle/pten/backends/gpu/gpu_context.h
similarity index 93%
rename from paddle/pten/backends/cuda/cuda_context.h
rename to paddle/pten/backends/gpu/gpu_context.h
index 332fdd2fda..1adfd155ce 100644
--- a/paddle/pten/backends/cuda/cuda_context.h
+++ b/paddle/pten/backends/gpu/gpu_context.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 
 namespace pten {
-using CUDAContext = paddle::platform::CUDADeviceContext;
+using GPUContext = paddle::platform::CUDADeviceContext;
 }  // namespace pten
 
 #endif
diff --git a/paddle/pten/common/backend.h b/paddle/pten/common/backend.h
index 95bbc88681..9944083248 100644
--- a/paddle/pten/common/backend.h
+++ b/paddle/pten/common/backend.h
@@ -43,7 +43,7 @@ enum class Backend : uint8_t {
   CPU,
 
   // various acceleration devices' backends
-  CUDA,
+  GPU,
   XPU,  // XPU currently does not exist at the same time as CUDA
   NPU,  // NPU currently does not exist at the same time as CUDA
 
@@ -99,8 +99,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
     case Backend::CPU:
      os << "CPU";
      break;
-    case Backend::CUDA:
-      os << "CUDA";
+    case Backend::GPU:
+      os << "GPU";
      break;
     case Backend::XPU:
      os << "XPU";
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
index 936d4effdf..bb8b416128 100644
--- a/paddle/pten/core/convert_utils.cc
+++ b/paddle/pten/core/convert_utils.cc
@@ -23,7 +23,7 @@ Backend TransToPtenBackend(const paddle::platform::Place& place) {
   if (paddle::platform::is_cpu_place(place)) {
     return Backend::CPU;
   } else if (paddle::platform::is_gpu_place(place)) {
-    return Backend::CUDA;
+    return Backend::GPU;
   } else {
     return Backend::UNDEFINED;
   }
@@ -84,7 +84,7 @@ paddle::platform::Place TransToFluidPlace(const Backend& backend) {
     case pten::Backend::CPU:
      return paddle::platform::CPUPlace();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case pten::Backend::CUDA:
+    case pten::Backend::GPU:
      return paddle::platform::CUDAPlace(
          paddle::platform::GetCurrentDeviceId());
 #endif
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index 62a46e128e..a33b13dac2 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -57,7 +57,7 @@ struct KernelArgsParseFunctor {
       if (arg_type == std::type_index(typeid(const CPUContext&))
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
          ||
-          arg_type == std::type_index(typeid(const CUDAContext&))) {
+          arg_type == std::type_index(typeid(const GPUContext&))) {
 #else
      ) {
 #endif
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 82ffa57387..7a7ae28330 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -181,7 +181,7 @@ struct KernelImpl {
   PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext);
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext);
diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h
index 60ec451be2..8f627f5fc8 100644
--- a/paddle/pten/include/linalg.h
+++ b/paddle/pten/include/linalg.h
@@ -18,7 +18,7 @@
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/cpu/linalg.h"
-#include "paddle/pten/kernels/cuda/linalg.h"
+#include "paddle/pten/kernels/gpu/linalg.h"
 
 namespace pten {
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
index e138c51e30..e94f2a6180 100644
--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -18,7 +18,7 @@
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
-#include "paddle/pten/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/gpu/manipulation.h"
 #include "paddle/pten/kernels/xpu/manipulation.h"
 
 namespace pten {
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index 8295c57654..83471692c8 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/cpu/conj_kernel.h"
 #include "paddle/pten/kernels/cpu/math.h"
-#include "paddle/pten/kernels/cuda/conj_kernel.h"
-#include "paddle/pten/kernels/cuda/math.h"
+#include "paddle/pten/kernels/gpu/conj_kernel.h"
+#include "paddle/pten/kernels/gpu/math.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace pten {
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index d87def812d..818ce6cb77 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -5,8 +5,7 @@ add_subdirectory(hybird)
 add_subdirectory(cpu)
 
 if(WITH_GPU OR WITH_ROCM)
-  # NOTE(chenweihang): if hip can split from cuda impl, we should add hip dir
-  add_subdirectory(cuda)
+  add_subdirectory(gpu)
 endif()
 if(WITH_MKLDNN)
   # mkldnn will be deprecated and use the new name dnnl
diff --git a/paddle/pten/kernels/cuda/CMakeLists.txt b/paddle/pten/kernels/cuda/CMakeLists.txt
deleted file mode 100644
index 428b2762ca..0000000000
--- a/paddle/pten/kernels/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-if(WITH_GPU)
-  nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_cuda)
-  nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  nv_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  nv_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
-  nv_library(scale_kernel_cuda SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  nv_library(full_kernel_cuda SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  nv_library(conj_kernel_cuda SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
-elseif(WITH_ROCM)
-  hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_cuda)
-  hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  hip_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  hip_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
-  hip_library(scale_kernel_cuda SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  hip_library(full_kernel_cuda SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  hip_library(conj_kernel_cuda SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
-endif()
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
new file mode 100644
index 0000000000..11ff1608b8
--- /dev/null
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -0,0 +1,17 @@
+if(WITH_GPU)
+  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  nv_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
+  nv_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
+  nv_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
+elseif(WITH_ROCM)
+  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  hip_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
+  hip_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
+  hip_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
+endif()
diff --git a/paddle/pten/kernels/cuda/conj_kernel.cu b/paddle/pten/kernels/gpu/conj_kernel.cu
similarity index 81%
rename from paddle/pten/kernels/cuda/conj_kernel.cu
rename to paddle/pten/kernels/gpu/conj_kernel.cu
index f3d2296f56..cb4fef883f 100644
--- a/paddle/pten/kernels/cuda/conj_kernel.cu
+++ b/paddle/pten/kernels/gpu/conj_kernel.cu
@@ -12,23 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/kernels/cuda/conj_kernel.h"
+#include "paddle/pten/kernels/gpu/conj_kernel.h"
 
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/kernels/hybird/math/conj_impl.h"
 
 namespace pten {
 
 template <typename T>
-void Conj(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  ConjImpl<T, CUDAContext>(dev_ctx, x, out);
+void Conj(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  ConjImpl<T, GPUContext>(dev_ctx, x, out);
 }
 
 }  // namespace pten
 
 PT_REGISTER_KERNEL(conj,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Conj,
                    paddle::platform::complex<float>,
diff --git a/paddle/pten/kernels/cuda/conj_kernel.h b/paddle/pten/kernels/gpu/conj_kernel.h
similarity index 86%
rename from paddle/pten/kernels/cuda/conj_kernel.h
rename to paddle/pten/kernels/gpu/conj_kernel.h
index 8ed0049d87..7541f9290d 100644
--- a/paddle/pten/kernels/cuda/conj_kernel.h
+++ b/paddle/pten/kernels/gpu/conj_kernel.h
@@ -17,13 +17,13 @@ limitations under the License. */
 
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
 
 namespace pten {
 
 template <typename T>
-void Conj(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+void Conj(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/cuda/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu
similarity index 93%
rename from paddle/pten/kernels/cuda/full_kernel.cu
rename to paddle/pten/kernels/gpu/full_kernel.cu
index 8a6639a2dc..16389d7749 100644
--- a/paddle/pten/kernels/cuda/full_kernel.cu
+++ b/paddle/pten/kernels/gpu/full_kernel.cu
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #include "paddle/pten/kernels/full_kernel.h"
 
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/kernels/impl/full_kernel_impl.h"
 
 PT_REGISTER_CTX_KERNEL(full,
-                       CUDA,
+                       GPU,
                        ALL_LAYOUT,
                        pten::Full,
                        float,
@@ -34,7 +34,7 @@ PT_REGISTER_CTX_KERNEL(full,
                        paddle::platform::complex<double>) {}
 
 PT_REGISTER_CTX_KERNEL(full_like,
-                       CUDA,
+                       GPU,
                        ALL_LAYOUT,
                        pten::FullLike,
                        float,
diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/gpu/linalg.cu
similarity index 90%
rename from paddle/pten/kernels/cuda/linalg.cu
rename to paddle/pten/kernels/gpu/linalg.cu
index da6511e2c8..c9bc4cbd07 100644
--- a/paddle/pten/kernels/cuda/linalg.cu
+++ b/paddle/pten/kernels/gpu/linalg.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/kernels/cuda/linalg.h" +#include "paddle/pten/kernels/gpu/linalg.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/hybird/eigen/dot.h" @@ -24,15 +24,15 @@ namespace pten { template -void Dot(const CUDAContext& dev_ctx, +void Dot(const GPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - eigen::Dot(dev_ctx, x, y, out); + eigen::Dot(dev_ctx, x, y, out); } template -void Matmul(const CUDAContext& dev_ctx, +void Matmul(const GPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, bool transpose_x, @@ -48,7 +48,7 @@ void Matmul(const CUDAContext& dev_ctx, paddle::platform::errors::InvalidArgument( "The Input(Y) dims size must not be equal 0," " but reviced dims size is 0. ")); - math::MatMulFunction( + math::MatMulFunction( dev_ctx, x, y, out, transpose_x, transpose_y); } @@ -59,7 +59,7 @@ using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; PT_REGISTER_KERNEL(dot, - CUDA, + GPU, ALL_LAYOUT, pten::Dot, float, @@ -70,7 +70,7 @@ PT_REGISTER_KERNEL(dot, complex128) {} PT_REGISTER_KERNEL(matmul, - CUDA, + GPU, ALL_LAYOUT, pten::Matmul, float, diff --git a/paddle/pten/kernels/cuda/linalg.h b/paddle/pten/kernels/gpu/linalg.h similarity index 89% rename from paddle/pten/kernels/cuda/linalg.h rename to paddle/pten/kernels/gpu/linalg.h index 84f48ca609..a848f55c7b 100644 --- a/paddle/pten/kernels/cuda/linalg.h +++ b/paddle/pten/kernels/gpu/linalg.h @@ -17,19 +17,19 @@ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" namespace pten { template -void Dot(const CUDAContext& dev_ctx, +void Dot(const GPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out); template -void Matmul(const CUDAContext& dev_ctx, +void Matmul(const GPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, bool transpose_x, diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu similarity index 88% rename from paddle/pten/kernels/cuda/manipulation.cu rename to paddle/pten/kernels/gpu/manipulation.cu index 49bbf1b61c..5a82e3e030 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/gpu/manipulation.cu @@ -14,15 +14,15 @@ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/infermeta/unary.h" -#include "paddle/pten/kernels/cuda/manipulation.h" -#include "paddle/pten/kernels/cuda/utils.h" +#include "paddle/pten/kernels/gpu/manipulation.h" +#include "paddle/pten/kernels/gpu/utils.h" #include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h" #include "paddle/pten/kernels/hybird/general/manipulation.h" namespace pten { template -void Flatten(const CUDAContext& dev_ctx, +void Flatten(const GPUContext& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis, @@ -36,7 +36,7 @@ void Flatten(const CUDAContext& dev_ctx, // Output Tensor, // is there a more flexible way to deal with this case? 
template -void FlattenWithXShape(const CUDAContext& dev_ctx, +void FlattenWithXShape(const GPUContext& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis, @@ -46,7 +46,7 @@ void FlattenWithXShape(const CUDAContext& dev_ctx, general::SetXShape(x, xshape); } -void Reshape(const CUDAContext& dev_ctx, +void Reshape(const GPUContext& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* out) { @@ -60,7 +60,7 @@ void Reshape(const CUDAContext& dev_ctx, out->ResetLoD(x.lod()); } -void ReshapeWithXShape(const CUDAContext& dev_ctx, +void ReshapeWithXShape(const GPUContext& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* xshape, @@ -70,7 +70,7 @@ void ReshapeWithXShape(const CUDAContext& dev_ctx, } template -void Cast(const CUDAContext& dev_ctx, +void Cast(const GPUContext& dev_ctx, const DenseTensor& x, DataType out_dtype, DataType in_dtype, @@ -85,7 +85,7 @@ void Cast(const CUDAContext& dev_ctx, using float16 = paddle::platform::float16; PT_REGISTER_KERNEL(flatten, - CUDA, + GPU, ALL_LAYOUT, pten::Flatten, float, @@ -96,7 +96,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} PT_REGISTER_KERNEL(flatten_with_xshape, - CUDA, + GPU, ALL_LAYOUT, pten::FlattenWithXShape, float, @@ -108,7 +108,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ PT_REGISTER_KERNEL(cast, \ - CUDA, \ + GPU, \ ALL_LAYOUT, \ pten::Cast, \ float, \ @@ -132,6 +132,6 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif -PT_REGISTER_NO_TEMPLATE_KERNEL(reshape, CUDA, ANY, pten::Reshape, ALL_DTYPE) {} +PT_REGISTER_NO_TEMPLATE_KERNEL(reshape, GPU, ANY, pten::Reshape, ALL_DTYPE) {} PT_REGISTER_NO_TEMPLATE_KERNEL( - reshape_with_xshape, CUDA, ANY, pten::ReshapeWithXShape, ALL_DTYPE) {} + reshape_with_xshape, GPU, ANY, pten::ReshapeWithXShape, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/cuda/manipulation.h b/paddle/pten/kernels/gpu/manipulation.h similarity index 86% rename from paddle/pten/kernels/cuda/manipulation.h rename to paddle/pten/kernels/gpu/manipulation.h index 165b08ad59..b47fadd70b 100644 --- a/paddle/pten/kernels/cuda/manipulation.h +++ b/paddle/pten/kernels/gpu/manipulation.h @@ -17,7 +17,7 @@ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -25,25 +25,25 @@ namespace pten { template -void Flatten(const CUDAContext& dev_ctx, +void Flatten(const GPUContext& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis, DenseTensor* out); template -void Cast(const CUDAContext& dev_ctx, +void Cast(const GPUContext& dev_ctx, const DenseTensor& x, DataType out_dtype, DataType in_dtype, DenseTensor* out); -void Reshape(const CUDAContext& dev_ctx, +void Reshape(const GPUContext& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* out); -void ReshapeWithXShape(const CUDAContext& dev_ctx, +void ReshapeWithXShape(const GPUContext& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* xshape, diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/gpu/math.cu similarity index 89% rename from paddle/pten/kernels/cuda/math.cu rename to paddle/pten/kernels/gpu/math.cu index 3dacc01e8b..59d816d237 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ 
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/kernels/cuda/math.h"
+#include "paddle/pten/kernels/gpu/math.h"
 
 #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
 #include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
@@ -58,12 +58,12 @@ struct DivideFunctor {
  */
 
 template <typename T>
-void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  eigen::Sign<CUDAContext, T>(dev_ctx, x, out);
+void Sign(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  eigen::Sign<GPUContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
-void Mean(const CUDAContext& dev_ctx,
+void Mean(const GPUContext& dev_ctx,
           const DenseTensor& x,
           const std::vector<int64_t>& dims,
           bool keep_dim,
@@ -84,7 +84,7 @@ DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
 DEFINE_CUDA_ELEMENTWISE_OP(Divide)
 
 template <typename T>
-void Sum(const CUDAContext& dev_ctx,
+void Sum(const GPUContext& dev_ctx,
          const DenseTensor& x,
          const std::vector<int64_t>& dims,
          bool keep_dim,
@@ -101,11 +101,10 @@ using float16 = paddle::platform::float16;
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL(sign, CUDA, ALL_LAYOUT, pten::Sign, float, double, float16) {
-}
-PT_REGISTER_KERNEL(mean, CUDA, ALL_LAYOUT, pten::Mean, float, double, bool) {}
+PT_REGISTER_KERNEL(sign, GPU, ALL_LAYOUT, pten::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool) {}
 PT_REGISTER_KERNEL(add,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Add,
                    float,
@@ -116,7 +115,7 @@ PT_REGISTER_KERNEL(add,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(subtract,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Subtract,
                    float,
@@ -127,7 +126,7 @@ PT_REGISTER_KERNEL(subtract,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(divide,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Divide,
                    float,
@@ -138,7 +137,7 @@ PT_REGISTER_KERNEL(divide,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(multiply,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Multiply,
                    float,
@@ -150,7 +149,7 @@ PT_REGISTER_KERNEL(multiply,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(sum,
-                   CUDA,
+                   GPU,
                    ALL_LAYOUT,
                    pten::Sum,
                    bool,
diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/gpu/math.h
similarity index 86%
rename from paddle/pten/kernels/cuda/math.h
rename to paddle/pten/kernels/gpu/math.h
index 9cb379bcf7..5a872542fb 100644
--- a/paddle/pten/kernels/cuda/math.h
+++ b/paddle/pten/kernels/gpu/math.h
@@ -17,17 +17,17 @@ limitations under the License. */
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
 
 namespace pten {
 
 template <typename T>
-void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+void Sign(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 template <typename T>
-void Mean(const CUDAContext& dev_ctx,
+void Mean(const GPUContext& dev_ctx,
           const DenseTensor& x,
           const std::vector<int64_t>& dims,
           bool keep_dim,
@@ -35,35 +35,35 @@ void Mean(const CUDAContext& dev_ctx,
           DenseTensor* out);
 
 template <typename T>
-void Add(const CUDAContext& dev_ctx,
+void Add(const GPUContext& dev_ctx,
          const DenseTensor& x,
          const DenseTensor& y,
          int axis,
          DenseTensor* out);
 
 template <typename T>
-void Subtract(const CUDAContext& dev_ctx,
+void Subtract(const GPUContext& dev_ctx,
               const DenseTensor& x,
               const DenseTensor& y,
               int axis,
               DenseTensor* out);
 
 template <typename T>
-void Divide(const CUDAContext& dev_ctx,
+void Divide(const GPUContext& dev_ctx,
             const DenseTensor& x,
             const DenseTensor& y,
             int axis,
             DenseTensor* out);
 
 template <typename T>
-void Multiply(const CUDAContext& dev_ctx,
+void Multiply(const GPUContext& dev_ctx,
              const DenseTensor& x,
              const DenseTensor& y,
              int axis,
              DenseTensor* out);
 
 template <typename T>
-void Sum(const CUDAContext& dev_ctx,
+void Sum(const GPUContext& dev_ctx,
          const DenseTensor& x,
          const std::vector<int64_t>& dims,
          bool keep_dim,
@@ -75,7 +75,7 @@ void Sum(const CUDAContext& dev_ctx,
 
 #define DEFINE_CUDA_ELEMENTWISE_OP(name)   \
   template <typename T>                    \
-  void name(const CUDAContext& dev_ctx,    \
+  void name(const GPUContext& dev_ctx,     \
             const DenseTensor& x,          \
             const DenseTensor& y,          \
             int axis,                      \
diff --git a/paddle/pten/kernels/cuda/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu
similarity index 93%
rename from paddle/pten/kernels/cuda/scale_kernel.cu
rename to paddle/pten/kernels/gpu/scale_kernel.cu
index 904976ae95..e67fd4cfdc 100644
--- a/paddle/pten/kernels/cuda/scale_kernel.cu
+++ b/paddle/pten/kernels/gpu/scale_kernel.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/pten/kernels/scale_kernel.h"
 
-#include "paddle/pten/backends/cuda/cuda_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/kernels/impl/scale_kernel_impl.h"
 
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 
 PT_REGISTER_CTX_KERNEL(scale,
-                       CUDA,
+                       GPU,
                        ALL_LAYOUT,
                        pten::Scale,
                        float,
diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/gpu/utils.cu
similarity index 98%
rename from paddle/pten/kernels/cuda/utils.cu
rename to paddle/pten/kernels/gpu/utils.cu
index cf1407e720..4d080be11e 100644
--- a/paddle/pten/kernels/cuda/utils.cu
+++ b/paddle/pten/kernels/gpu/utils.cu
@@ -16,11 +16,11 @@ limitations under the License. */
*/ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/cuda/utils.h" +#include "paddle/pten/kernels/gpu/utils.h" namespace pten { -void Copy(const CUDAContext& dev_ctx, +void Copy(const GPUContext& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst) { @@ -234,4 +234,4 @@ void Copy(const CUDAContext& dev_ctx, } } // namespace pten -PT_REGISTER_NO_TEMPLATE_KERNEL(copy, CUDA, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} +PT_REGISTER_NO_TEMPLATE_KERNEL(copy, GPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/cuda/utils.h b/paddle/pten/kernels/gpu/utils.h similarity index 91% rename from paddle/pten/kernels/cuda/utils.h rename to paddle/pten/kernels/gpu/utils.h index bd29347e31..3a455ad70c 100644 --- a/paddle/pten/kernels/cuda/utils.h +++ b/paddle/pten/kernels/gpu/utils.h @@ -17,13 +17,13 @@ limitations under the License. */ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" namespace pten { -void Copy(const CUDAContext& dev_ctx, +void Copy(const GPUContext& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst); diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt index 9d4d86f067..1304aa1798 100644 --- a/paddle/pten/kernels/hybird/CMakeLists.txt +++ b/paddle/pten/kernels/hybird/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(general) cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) if(WITH_GPU) - nv_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc pten_context) + nv_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) elseif(WITH_ROCM) - hip_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc pten_context) + hip_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) endif() diff --git a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h b/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h index 54a584d78d..d8c58448c9 100644 --- a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h +++ b/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/fluid/platform/aligned_vector.h" @@ -50,7 +50,7 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { } template -void CastCUDAKernelImpl(const CUDAContext& dev_ctx, +void CastCUDAKernelImpl(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { auto* in_data = x.data(); diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h index f55d483de1..793e8505ec 100644 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h +++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h @@ -17,7 +17,7 @@ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/common/scalar.h" #include 
"paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" @@ -49,7 +49,7 @@ static inline std::vector GetReduceDim( } template class ReduceFunctor> -void Reduce(const CUDAContext& dev_ctx, +void Reduce(const GPUContext& dev_ctx, const DenseTensor& x, bool reduce_all, const std::vector& dims, diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h index e7aecf3b27..bdb883c1df 100644 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h @@ -42,7 +42,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/api/include/tensor.h" -#include "paddle/pten/kernels/cuda/utils.h" +#include "paddle/pten/kernels/gpu/utils.h" #include "paddle/pten/kernels/hybird/math/cast_func.h" // Reduce split or not, Whether to use ReduceHigherDim @@ -820,7 +820,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, y->Resize(out_dims); } else { PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] { - pten::math::CastKernelImpl( + pten::math::CastKernelImpl( *dev_ctx, x, y); })); } diff --git a/paddle/pten/kernels/hybird/general/elementwise_base.h b/paddle/pten/kernels/hybird/general/elementwise_base.h index 827af86812..20154a8744 100644 --- a/paddle/pten/kernels/hybird/general/elementwise_base.h +++ b/paddle/pten/kernels/hybird/general/elementwise_base.h @@ -132,11 +132,11 @@ class MidWiseTransformIterator #if defined(__NVCC__) || defined(__HIPCC__) template -class RowwiseTransformIterator - : public thrust::iterator_adaptor, +class RowwiseTransformIterator + : public thrust::iterator_adaptor, const T *> { public: - typedef thrust::iterator_adaptor, + typedef thrust::iterator_adaptor, const T *> super_t; HOSTDEVICE RowwiseTransformIterator(const T *x, int n) @@ -152,11 +152,11 @@ class RowwiseTransformIterator }; template -class MidWiseTransformIterator - : public thrust::iterator_adaptor, +class MidWiseTransformIterator + : public thrust::iterator_adaptor, const T *> { public: - typedef thrust::iterator_adaptor, + typedef thrust::iterator_adaptor, const T *> super_t; HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post) diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu index 5c5bd29117..195277c216 100644 --- a/paddle/pten/kernels/hybird/transpose.cu +++ b/paddle/pten/kernels/hybird/transpose.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/pten/backends/cuda/cuda_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/hybird/math/cast_func.h" #include "paddle/pten/kernels/hybird/transpose.h" @@ -52,9 +52,9 @@ __global__ void TransposeNormalKernel(const T* in_ptr, } template -struct TransposeNormal { +struct TransposeNormal { // for dims >= 7 situation - void operator()(const CUDAContext& dev_ctx, + void operator()(const GPUContext& dev_ctx, const pten::DenseTensor& in, pten::DenseTensor* out, const std::vector& axis) { @@ -106,7 +106,7 @@ struct TransposeNormal { // define transpose normal #define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal + template struct TransposeNormal DEFINE_GPU_TRANS_NORMAL(bool); DEFINE_GPU_TRANS_NORMAL(int8_t); diff --git a/paddle/pten/tests/api/scale_api.h 
index 1defbd02dd..d525b305c7 100644
--- a/paddle/pten/tests/api/scale_api.h
+++ b/paddle/pten/tests/api/scale_api.h
@@ -142,13 +142,13 @@ static void ScaleCPU(DataType kernel_dtype,
 }
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-static void ScaleCUDA(DataType kernel_dtype,
-                      const pten::CUDAContext& dev_ctx,
-                      const pten::DenseTensor& x,
-                      const Scalar& scale,
-                      float bias,
-                      bool bias_after_scale,
-                      pten::DenseTensor* dense_out) {
+static void ScaleGPU(DataType kernel_dtype,
+                     const pten::GPUContext& dev_ctx,
+                     const pten::DenseTensor& x,
+                     const Scalar& scale,
+                     float bias,
+                     bool bias_after_scale,
+                     pten::DenseTensor* dense_out) {
   switch (kernel_dtype) {
     case pten::DataType::FLOAT64: {
      pten::Scale<double>(
@@ -255,14 +255,14 @@ Tensor scale_switch_case(const Tensor& x,
                dense_out.get());
      break;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case Backend::CUDA:
-      ScaleCUDA(kernel_data_type,
-                static_cast<const pten::CUDAContext&>(*dev_ctx),
-                *dense_x,
-                scale,
-                bias,
-                bias_after_scale,
-                dense_out.get());
+    case Backend::GPU:
+      ScaleGPU(kernel_data_type,
+               static_cast<const pten::GPUContext&>(*dev_ctx),
+               *dense_x,
+               scale,
+               bias,
+               bias_after_scale,
+               dense_out.get());
      break;
 #endif
     default:
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index 01ca4aad64..e29fa11d58 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/cuda/utils.h"
+#include "paddle/pten/kernels/gpu/utils.h"
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc
index c445e18679..b59cee5dc7 100644
--- a/paddle/pten/tests/api/test_tensor_utils.cc
+++ b/paddle/pten/tests/api/test_tensor_utils.cc
@@ -110,7 +110,7 @@ TEST(PtenUtils, VarToPtTensor) {
   pten::Backend expect_backend = pten::Backend::CPU;
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  expect_backend = pten::Backend::CUDA;
+  expect_backend = pten::Backend::GPU;
 #endif
   auto tensor_def = pten::TensorArgDef(
      expect_backend, pten::DataLayout::NCHW, pten::DataType::INT32);
diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc
index 5bb6f386c7..47e8ff7c2c 100644
--- a/paddle/pten/tests/api/test_to_api.cc
+++ b/paddle/pten/tests/api/test_to_api.cc
@@ -64,7 +64,7 @@ TEST(API, copy_to) {
 
   // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = paddle::experimental::copy_to(x, pten::Backend::CUDA, false);
+  auto tmp = paddle::experimental::copy_to(x, pten::Backend::GPU, false);
   auto out = paddle::experimental::copy_to(tmp, pten::Backend::CPU, true);
 #else
   auto out = paddle::experimental::copy_to(x, pten::Backend::CPU, false);
@@ -80,7 +80,7 @@ TEST(Tensor, copy_to) {
 
   // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = x.copy_to(pten::Backend::CUDA, false);
+  auto tmp = x.copy_to(pten::Backend::GPU, false);
   auto out = tmp.copy_to(pten::Backend::CPU, true);
 #else
   auto out = x.copy_to(pten::Backend::CPU, false);
diff --git a/paddle/pten/tests/common/test_backend.cc b/paddle/pten/tests/common/test_backend.cc
index 148785d4b6..8b006ee0aa 100644
--- a/paddle/pten/tests/common/test_backend.cc
+++ b/paddle/pten/tests/common/test_backend.cc
@@ -29,8 +29,8 @@ TEST(Backend, OStream) {
   oss << pten::Backend::CPU;
   EXPECT_EQ(oss.str(), "CPU");
   oss.str("");
-  oss << pten::Backend::CUDA;
-  EXPECT_EQ(oss.str(), "CUDA");
+  oss << pten::Backend::GPU;
+  EXPECT_EQ(oss.str(), "GPU");
   oss.str("");
   oss << pten::Backend::XPU;
   EXPECT_EQ(oss.str(), "XPU");
-- 
GitLab