unify fluid::CUDADeviceContext and phi::GpuContext (#44723)

* remove CUDADeviceContext
* remove more templates
* fix ROCm compile

unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove CUDADeviceContext
* remove more templates
* fix ROCm compile
88490567 · Leo Chen · GitHub · 0a2db7c8 · 88490567 · 88490567
26 changed files
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"

-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace framework {
 class GarbageCollector;

--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
-
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
 namespace memory {
 namespace allocation {


--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif

-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif

  size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
  // first sum pool
  FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                             config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
    // not need show click input
    N = static_cast<size_t>(batch_size * slot_num *
                            (embedding_size - cvm_offset));
-    platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(dev_ctx, N);
    FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif

  size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
  if (use_cvm) {
    // join grad
    FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,

--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -14,13 +14,6 @@ limitations under the License. */

 #include "paddle/fluid/operators/gru_op.h"

-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
  }
 }

-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
-
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;

--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
  }
 };

-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                             phi::GPUContext,
                             float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                             phi::GPUContext,
                             double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                             phi::GPUContext,
                             float>;
@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
  }
 };

-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                             phi::GPUContext,
                             float>;
@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                             phi::GPUContext,
                             double>;

-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                             phi::GPUContext,
                             float>;

--- a/paddle/fluid/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
                                                              axis);
 }

-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;


--- a/paddle/fluid/operators/math/sample_prob.h
+++ b/paddle/fluid/operators/math/sample_prob.h
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"

-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
 }
 }  // namespace

-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height,
-            in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height,
-            out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel,
-            output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            out_data,
-            in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
  void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
  }
 };

-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
-
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
 }
 }  // namespace

-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height,
-                                          in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            in2_data,
-            in1_row_numel);
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
  void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
  }
 };

-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
  }
 };

-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, output, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, inputs, output, sorted_result);
-  }
-};
-
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
  // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
  }
 };

-#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                    \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
-  template struct MergeAddImpl<phi::GPUContext, dtype>;             \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;     \
+#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)        \
+  template struct MergeAddImpl<phi::GPUContext, dtype>; \
  template struct MergeAdd<phi::GPUContext, dtype>;

 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)

--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
  }
 }

-template <typename T>
-class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    PADDLE_ENFORCE_GE(
-        pad_seq_len,
-        max_seq_len,
-        platform::errors::InvalidArgument(
-            "The pad_seq_len must be equal to or greater than the "
-            "original max sequence length. Expected %ld >= %ld, but got %ld < "
-            "%ld. Please check the input value.",
-            pad_seq_len,
-            max_seq_len,
-            pad_seq_len,
-            max_seq_len));
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    PADDLE_ENFORCE_EQ(
-        pad_value.numel() == 1 || pad_value.numel() == step_width,
-        true,
-        platform::errors::InvalidArgument(
-            "The numel of 'pad_value' can only be 1 or be equal to "
-            "the 'step_width', but got %ld != 1 and %ld. Please check the "
-            "input value.",
-            pad_value.numel(),
-            step_width));
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* seq_data = seq_tensor.data<T>();
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-
-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
-        pad_data,
-        seq_data,
-        pad_value_data,
-        pad_value.numel() == 1,
-        mix_vector_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class PaddingLoDTensorFunctor<phi::GPUContext, T> {
 public:
@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
  }
 };

-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    /*
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
-    seq_tensor);
-      seq_tensor->Resize(seq_tensor_dims);
-      return;
-    }
-    */
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* pad_data = pad_tensor.data<T>();
-    T* seq_data = seq_tensor->data<T>();
-
-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
-        seq_data,
-        pad_data,
-        nullptr,
-        false,
-        mixv_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
 public:
@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
  }
 };

-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, float>;

--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
  }
 }

-template <typename T>
-class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq->numel() / seq->dims()[0];
-    auto abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
-
-#ifdef PADDLE_WITH_HIP
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
-        dim3(num_seq),
-        dim3(PADDLE_CUDA_NUM_THREADS),
-        0,
-        context.stream(),
-        seq_data,
-        mix_vector.CUDAMutableData(context.GetPlace()),
-        scales,
-        seq_width);
-#else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
-        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-            seq_data,
-            mix_vector.CUDAMutableData(context.GetPlace()),
-            scales,
-            seq_width);
-#endif
-    mix_vector.CopyToCPU();
-  }
-};
-
 template <typename T>
 class ScaleLoDTensorFunctor<phi::GPUContext, T> {
 public:
@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
  }
 };

-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
 template class ScaleLoDTensorFunctor<phi::GPUContext, double>;


--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
 #endif
 }

-template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxCUDNNFunctor<platform::float16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::float16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
 #if CUDNN_VERSION_MIN(8, 1, 0)
-template class SoftmaxCUDNNFunctor<platform::bfloat16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 #endif

 // MIOPEN do not support double
 #ifndef PADDLE_WITH_HIP
-template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
 #endif

-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::bfloat16>;
-
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
 template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;

--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
 }
 // };

-template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
-template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
 template class Vol2ColFunctor<phi::GPUContext, float>;
 template class Vol2ColFunctor<phi::GPUContext, double>;

-template class Col2VolFunctor<platform::CUDADeviceContext, float>;
-template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 template class Col2VolFunctor<phi::GPUContext, float>;
 template class Col2VolFunctor<phi::GPUContext, double>;


--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -16,12 +16,6 @@

 #include "paddle/fluid/framework/op_registry.h"

-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 REGISTER_OP_CUDA_KERNEL(
    sequence_concat,
    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,

--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@@ -51,7 +51,6 @@ namespace platform {
 //
 // The NCCLComm instance is created and reversed in the NCCLCommContext
 // singleton with a global user specified group id.
-class CUDADeviceContext;

 class NCCLComm {
 public:

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }

-CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
-    : phi::GPUContext(place) {}
-
-CUDADeviceContext::~CUDADeviceContext() = default;
-
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;

-class CUDADeviceContext : public phi::GPUContext {
- public:
-  explicit CUDADeviceContext(CUDAPlace place);
-  virtual ~CUDADeviceContext();
-
- private:
-  int place_holder_;  // TO BE REMOVED
-  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
-};
+using CUDADeviceContext = phi::GPUContext;

 class CudnnWorkspaceHandle {
 public:

--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
 };

 #if defined(__NVCC__) || defined(__HIPCC__)
-template <>
-struct Transform<platform::CUDADeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter first,
-                  InputIter last,
-                  OutputIter result,
-                  UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-
-  template <typename InputIter1,
-            typename InputIter2,
-            typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter1 first1,
-                  InputIter1 last1,
-                  InputIter2 first2,
-                  OutputIter result,
-                  BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-};

 template <>
 struct Transform<phi::GPUContext> {

--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
--- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
  AddReluKernel(context.stream(), M, N, Y, B, relu);
 }

-template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
-template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
-template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
-
 template class FCFunctor<GPUContext, float16>;
 template class FCFunctor<GPUContext, float>;
 template class FCFunctor<GPUContext, double>;

--- a/paddle/phi/kernels/funcs/for_range.h
+++ b/paddle/phi/kernels/funcs/for_range.h
@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
  size_t limit_;
 };

-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<paddle::platform::CUDADeviceContext> {
-  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(limit) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
-    for_range(func);
-  }
-
-  const paddle::platform::CUDADeviceContext& dev_ctx_;
-  size_t limit_;
-};
-
 #endif

 }  // namespace funcs

--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -31,22 +31,6 @@ namespace funcs {
 using float16 = phi::dtype::float16;
 using bfloat16 = phi::dtype::bfloat16;

-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::float16>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::bfloat16>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::complex<float>>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::complex<double>>;
-
 template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
 template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
 template struct SetConstant<phi::GPUContext, float>;
@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
 template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                            phi::dtype::complex<double>>;

-#define DEFINE_GPU_TRANS(RANK)                                                 \
-  template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>;  \
-  template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            double,                                            \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            float16,                                           \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            bfloat16,                                          \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            int8_t,                                            \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            int32_t,                                           \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            int64_t,                                           \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            phi::dtype::complex<float>,                        \
-                            RANK>;                                             \
-  template struct Transpose<paddle::platform::CUDADeviceContext,               \
-                            phi::dtype::complex<double>,                       \
-                            RANK>;                                             \
-  template struct Transpose<phi::GPUContext, bool, RANK>;                      \
-  template struct Transpose<phi::GPUContext, float, RANK>;                     \
-  template struct Transpose<phi::GPUContext, double, RANK>;                    \
-  template struct Transpose<phi::GPUContext, float16, RANK>;                   \
-  template struct Transpose<phi::GPUContext, bfloat16, RANK>;                  \
-  template struct Transpose<phi::GPUContext, int8_t, RANK>;                    \
-  template struct Transpose<phi::GPUContext, int32_t, RANK>;                   \
-  template struct Transpose<phi::GPUContext, int64_t, RANK>;                   \
-  template struct Transpose<phi::GPUContext,                                   \
-                            phi::dtype::complex<float>,                        \
-                            RANK>;                                             \
+#define DEFINE_GPU_TRANS(RANK)                                \
+  template struct Transpose<phi::GPUContext, bool, RANK>;     \
+  template struct Transpose<phi::GPUContext, float, RANK>;    \
+  template struct Transpose<phi::GPUContext, double, RANK>;   \
+  template struct Transpose<phi::GPUContext, float16, RANK>;  \
+  template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
+  template struct Transpose<phi::GPUContext, int8_t, RANK>;   \
+  template struct Transpose<phi::GPUContext, int32_t, RANK>;  \
+  template struct Transpose<phi::GPUContext, int64_t, RANK>;  \
+  template struct Transpose<phi::GPUContext,                  \
+                            phi::dtype::complex<float>,       \
+                            RANK>;                            \
  template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;

 DEFINE_GPU_TRANS(1);
@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
 };

 // define transpose normal
-#define DEFINE_GPU_TRANS_NORMAL(TYPE)                                         \
-  template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
+#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
  template struct TransposeNormal<phi::GPUContext, TYPE>

 DEFINE_GPU_TRANS_NORMAL(float16);

--- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
 template class MatrixInverseFunctor<GPUContext, float>;
 template class MatrixInverseFunctor<GPUContext, double>;

-// TODO(chenweihang): remove these instantiations later
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
-                                    double>;
-
 }  // namespace funcs
 }  // namespace phi
--- a/paddle/phi/kernels/funcs/matrix_solve.cu
+++ b/paddle/phi/kernels/funcs/matrix_solve.cu
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
 template class MatrixSolveFunctor<GPUContext, float>;
 template class MatrixSolveFunctor<GPUContext, double>;

-// TODO(wuweilong): remove these instantiations later
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
-
 }  // namespace funcs
 }  // namespace phi