Unverified commit 88490567, authored by Leo Chen, committed by GitHub

unify fluid::CUDADeviceContext and phi::GPUContext (#44723)

* remove CUDADeviceContext

* remove more template specializations

* fix ROCm compile
Parent 0a2db7c8
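A short orientation note (not part of the commit message): the pivotal hunk further down, in the device-context header, deletes the fluid-side class and leaves only a type alias. The sketch below is a simplified before/after, with a forward declaration standing in for the real phi header; treat it as an illustration, not the literal file contents.

// Sketch only; condensed from the device-context hunk later in this diff.
// Before this PR, fluid declared a thin subclass:
//
//   class CUDADeviceContext : public phi::GPUContext {
//    public:
//     explicit CUDADeviceContext(CUDAPlace place);
//     virtual ~CUDADeviceContext();
//   };
//
// After this PR the fluid name is only an alias, so
// Blas<platform::CUDADeviceContext> and Blas<phi::GPUContext> name the same
// type and a single set of explicit template instantiations is enough; that
// is why most of this diff is deletions of duplicated specializations.

namespace phi {
class GPUContext;  // real definition comes from phi's gpu_context header
}  // namespace phi

namespace paddle {
namespace platform {
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform
}  // namespace paddle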
......@@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class GarbageCollector;
......
......@@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
namespace memory {
namespace allocation {
......
......@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
// first sum pool
FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
config.thread_per_block.x,
......@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N = static_cast<size_t>(batch_size * slot_num *
(embedding_size - cvm_offset));
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, N);
FusedCVMKernelNoCVM<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
......@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
auto config = GetGpuLaunchConfig1D(dev_ctx, N);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
if (use_cvm) {
// join grad
FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
......
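A note on the GetGpuLaunchConfig1D hunks above (my reading, not stated in the patch): once dev_ctx is a phi::GPUContext rather than a class in paddle::platform, argument-dependent lookup no longer searches paddle::platform, so the helper must be called with an explicit platform:: qualifier. A self-contained toy example of the lookup difference, where the names mirror Paddle's but the types are empty stand-ins:

#include <cstddef>

namespace phi {
struct GPUContext {};                 // stand-in for phi::GPUContext
}  // namespace phi

namespace platform {
struct CUDADeviceContext {};          // stand-in for the old fluid type
int GetGpuLaunchConfig1D(const CUDADeviceContext&, std::size_t) { return 0; }
int GetGpuLaunchConfig1D(const phi::GPUContext&, std::size_t) { return 1; }
}  // namespace platform

int main() {
  platform::CUDADeviceContext old_ctx;
  phi::GPUContext new_ctx;
  GetGpuLaunchConfig1D(old_ctx, 8);            // found via ADL in platform::
  // GetGpuLaunchConfig1D(new_ctx, 8);         // error: ADL only searches phi::
  platform::GetGpuLaunchConfig1D(new_ctx, 8);  // explicit qualification, as in
                                               // the hunks above
  return 0;
}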
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
template class CrossEntropyFunctor<phi::GPUContext, float>;
template class CrossEntropyFunctor<phi::GPUContext, double>;
template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
......
......@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
......@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......
......@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis);
}
template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
template class MaxOutFunctor<platform::CUDADeviceContext, float>;
template class MaxOutFunctor<platform::CUDADeviceContext, double>;
template class MaxOutGradFunctor<phi::GPUContext, float>;
template class MaxOutGradFunctor<phi::GPUContext, double>;
......
......@@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2,
framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]",
in1_height,
in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height,
out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]",
in1_height,
out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel,
output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]",
in1_row_numel,
output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};
template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
......@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
in2_data,
in1_row_numel);
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
......@@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, output, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, inputs, output, sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
......@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
......
......@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
PADDLE_ENFORCE_GE(
pad_seq_len,
max_seq_len,
platform::errors::InvalidArgument(
"The pad_seq_len must be equal to or greater than the "
"original max sequence length. Expected %ld >= %ld, but got %ld < "
"%ld. Please check the input value.",
pad_seq_len,
max_seq_len,
pad_seq_len,
max_seq_len));
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE_EQ(
pad_value.numel() == 1 || pad_value.numel() == step_width,
true,
platform::errors::InvalidArgument(
"The numel of 'pad_value' can only be 1 or be equal to "
"the 'step_width', but got %ld != 1 and %ld. Please check the "
"input value.",
pad_value.numel(),
step_width));
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq_tensor.data<T>();
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data,
seq_data,
pad_value_data,
pad_value.numel() == 1,
mix_vector_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class PaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
/*
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
return;
}
*/
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data,
pad_data,
nullptr,
false,
mixv_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
......
......@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
}
}
template <typename T>
class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const T* scales,
framework::LoDTensor* seq) {
const size_t level = 0;
auto lod = seq->lod();
const size_t num_seq = lod[level].size() - 1;
const size_t seq_width = seq->numel() / seq->dims()[0];
auto abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(
HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
dim3(num_seq),
dim3(PADDLE_CUDA_NUM_THREADS),
0,
context.stream(),
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#else
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
<<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#endif
mix_vector.CopyToCPU();
}
};
template <typename T>
class ScaleLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
......
......@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
#endif
}
template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
#endif
// MIOPEN do not support double
#ifndef PADDLE_WITH_HIP
template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
......
......@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
}
// };
template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
template class Vol2ColFunctor<phi::GPUContext, float>;
template class Vol2ColFunctor<phi::GPUContext, double>;
template class Col2VolFunctor<platform::CUDADeviceContext, float>;
template class Col2VolFunctor<platform::CUDADeviceContext, double>;
template class Col2VolFunctor<phi::GPUContext, float>;
template class Col2VolFunctor<phi::GPUContext, double>;
......
......@@ -16,12 +16,6 @@
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
sequence_concat,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
......
......@@ -51,7 +51,6 @@ namespace platform {
//
// The NCCLComm instance is created and reversed in the NCCLCommContext
// singleton with a global user specified group id.
class CUDADeviceContext;
class NCCLComm {
public:
......
......@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: phi::GPUContext(place) {}
CUDADeviceContext::~CUDADeviceContext() = default;
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
......
......@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
class CUDADeviceContext : public phi::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
private:
int place_holder_; // TO BE REMOVED
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
using CUDADeviceContext = phi::GPUContext;
class CudnnWorkspaceHandle {
public:
......
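For context, a hypothetical caller (not code from this patch): because platform::CUDADeviceContext is now just an alias, existing fluid code that takes the old type keeps compiling and simply sees the phi::GPUContext API. The include path below is the fluid header that carries the alias per the hunk above; treat it as an assumption.

#include "paddle/fluid/platform/device_context.h"

// Hypothetical sketch: old-style fluid signature, new phi behavior.
void EnqueueOnFluidContext(const paddle::platform::CUDADeviceContext& ctx) {
  auto stream = ctx.stream();  // resolves to phi::GPUContext::stream()
  (void)stream;                // ... launch kernels on `stream` as before
}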
......@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <>
struct Transform<platform::CUDADeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter first,
InputIter last,
OutputIter result,
UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#endif
}
template <typename InputIter1,
typename InputIter2,
typename OutputIter,
typename BinaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter1 first1,
InputIter1 last1,
InputIter2 first2,
OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#endif
}
};
template <>
struct Transform<phi::GPUContext> {
......
......@@ -66,58 +66,6 @@ struct CUBlas<float> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const float *beta,
void *C,
cudaDataType_t Ctype,
int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasSgemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -366,66 +314,6 @@ struct CUBlas<phi::dtype::float16> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -636,66 +524,6 @@ struct CUBlas<phi::dtype::complex<float>> {
ldb));
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -965,66 +793,6 @@ struct CUBlas<phi::dtype::complex<double>> {
batch_size));
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -1088,16 +856,16 @@ struct CUBlas<phi::dtype::complex<double>> {
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1109,8 +877,7 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx =
const_cast<paddle::platform::CUDADeviceContext &>(context_);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
......@@ -1152,151 +919,6 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
CUDA_R_32F,
ldb,
A,
CUDA_R_32F,
lda,
&beta,
C,
CUDA_R_32F,
N);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&h_beta,
C,
CUDA_R_16F,
N,
CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
h_B,
ldb,
h_A,
lda,
&h_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1376,77 +998,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 80,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16BF,
ldb,
A,
CUDA_R_16BF,
lda,
&h_beta,
C,
CUDA_R_16BF,
N,
CUDA_R_32F,
algo));
});
#else
// raise error
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx with bfloat16 is not supported on cuda <= 11"));
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1517,87 +1068,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
CUDA_C_32F,
ldb,
A,
CUDA_C_32F,
lda,
&c_beta,
C,
CUDA_C_32F,
N,
CUDA_C_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::complex<float>>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
h_B,
ldb,
h_A,
lda,
&c_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1680,17 +1150,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1719,7 +1188,7 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
......@@ -1760,153 +1229,6 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
CUDA_C_64F,
ldb,
A,
CUDA_C_64F,
lda,
&c_beta,
C,
CUDA_C_64F,
N,
CUDA_C_64F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::complex<double>>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
h_B,
ldb,
h_A,
lda,
&c_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
T alpha,
const T *A,
int lda,
const T *B,
int ldb,
T beta,
T *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx =
const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
CUDA_R_32F,
ldb,
A,
CUDA_R_32F,
lda,
&beta,
C,
CUDA_R_32F,
ldc);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1972,45 +1294,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
int lda,
const phi::dtype::float16 *B,
int ldb,
phi::dtype::float16 beta,
phi::dtype::float16 *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -2049,17 +1332,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n,
T alpha,
const T *x,
T *y) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
......@@ -2068,15 +1340,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n,
const T alpha,
T *x) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -2086,247 +1349,67 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n,
const T *x,
T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
#if CUDA_VERSION >= 9010
if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
std::is_same<T, phi::dtype::float16>::value) {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
VLOG(4) << "use_half_precision_compute_type: "
<< FLAGS_gemm_use_half_precision_compute_type;
auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t compute_type = fp;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
void *a = static_cast<void *>(&h_alpha);
void *b = static_cast<void *>(&h_beta);
// set ComputeType as CUDA_R_32F for fp16, for better accuracy
if (FLAGS_gemm_use_half_precision_compute_type == true &&
std::is_same<T, phi::dtype::float16>::value) {
a = static_cast<void *>(&alpha);
b = static_cast<void *>(&beta);
compute_type = CUDA_R_16F;
}
// set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
cuTransB,
cuTransA,
N,
M,
K,
a,
B,
fp,
ldb,
strideB,
A,
fp,
lda,
strideA,
b,
C,
fp,
ldc,
strideC,
batchCount,
compute_type,
algo));
});
} else {
#endif // CUDA_VERSION >= 9010
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
strideB,
A,
lda,
strideA,
&beta,
C,
ldc,
strideC,
batchCount);
});
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
#endif // CUDA_VERSION >= 9010
}
template <>
......@@ -2438,78 +1521,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 9010
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmStridedBatchedEx(
handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16BF,
ldb,
strideB,
A,
CUDA_R_16BF,
lda,
strideA,
&h_beta,
C,
CUDA_R_16BF,
ldc,
strideC,
batchCount,
CUBLAS_COMPUTE_32F,
algo));
});
#else
// raise error
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= "
"11"));
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2582,26 +1593,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 11000
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T **A,
const T **B,
T beta,
T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2621,26 +1612,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 **A,
const phi::dtype::float16 **B,
phi::dtype::float16 beta,
phi::dtype::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::float16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2660,26 +1631,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 **A,
const phi::dtype::bfloat16 **B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::bfloat16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2699,37 +1650,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T *A,
int lda,
T *B,
int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM(
handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
});
}
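Spelling out the one-line comment in the TRSM bodies (algebra only, nothing new added to the patch): transposing the triangular system moves the matrix to the other side and flips its triangle, which is why cuSide and cuUplo are inverted while the row-major buffers pass straight through:

$$\mathrm{op}(A)\,X = \alpha B \;\Longleftrightarrow\; X^{\top}\,\mathrm{op}(A)^{\top} = \alpha B^{\top},$$

so a left-side, lower-triangular op(A) becomes a right-side, upper-triangular op(A)ᵀ.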
template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
......@@ -2761,15 +1681,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
int n, T **a, int *ipiv, int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
......@@ -2779,25 +1690,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size)
const {
PADDLE_ENFORCE_NE(
a_inv,
a,
phi::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv,
a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
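A hypothetical sketch (not code from this patch) of how the two batched wrappers are meant to be chained for batched inversion on the remaining phi::GPUContext path; the GetBlas helper, the header path, and all buffer names below are assumptions:

#include "paddle/phi/kernels/funcs/blas/blas.h"

template <typename T>
void BatchedInverse(const phi::GPUContext& dev_ctx,
                    T** d_a,      // device array of batch_size pointers to n*n inputs
                    T** d_a_inv,  // device array of output pointers; must not alias d_a
                    int* d_ipiv,  // n * batch_size pivot indices
                    int* d_info,  // batch_size status codes
                    int n,
                    int batch_size) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
  // LU-factorize every matrix in place: a <- P * L * U.
  blas.BatchedGETRF(n, d_a, d_ipiv, d_info, batch_size);
  // Form the inverses from the LU factors; the wrapper above rejects in-place output.
  blas.BatchedGETRI(
      n, const_cast<const T**>(d_a), d_ipiv, d_a_inv, d_info, batch_size);
}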
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
......@@ -2820,15 +1712,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
int n, const T **a, T **a_inv, int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
......@@ -2838,28 +1721,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans,
int n,
int nrhs,
const T **a,
int lda,
int *ipiv,
T **b,
int ldb,
int *info,
int batch_size) const {
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(
handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
});
}
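For context (standard LAPACK algebra, nothing added to the patch itself): GETRS consumes the factorization produced by BatchedGETRF above, so for the no-transpose case each batched solve A X = B reduces to two triangular sweeps:

$$P A = L U \;\Rightarrow\; \text{solve } L\,Y = P\,B \text{ (forward substitution)},\qquad U\,X = Y \text{ (backward substitution)}.$$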
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
......@@ -2881,50 +1742,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T **A,
int lda,
T **B,
int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM_BATCH(handle,
cuSide,
cuUplo,
cuTransA,
cuDiag,
N,
M,
&alpha,
A,
lda,
B,
ldb,
batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
......
......@@ -257,54 +257,6 @@ struct CUBlas<phi::dtype::float16> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
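  // NOTE: rocblas_gemm_ex takes separate C (input) and D (output) matrices,
  // which is why C, Ctype and ldc appear twice in the call above; passing the
  // same buffer for both gives the usual in-place
  // C = alpha * op(A) * op(B) + beta * C.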
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -474,54 +426,6 @@ struct CUBlas<phi::dtype::complex<float>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -692,54 +596,6 @@ struct CUBlas<phi::dtype::complex<double>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -789,45 +645,6 @@ struct CUBlas<phi::dtype::complex<double>> {
}
};
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -868,62 +685,6 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_f16_r,
ldb,
A,
rocblas_datatype_f16_r,
lda,
&h_beta,
C,
rocblas_datatype_f16_r,
N,
rocblas_datatype_f32_r);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -982,17 +743,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1052,11 +812,11 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1067,140 +827,19 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(zhiqiu): 80 has the same meaning for rocm and cuda?
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
53,
phi::errors::InvalidArgument(
"rocblas fp16 gemm requires GPU compute capability >= 80,"
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_bf16_r,
ldb,
A,
rocblas_datatype_bf16_r,
lda,
&h_beta,
C,
rocblas_datatype_bf16_r,
N,
C,
rocblas_datatype_bf16_r,
N,
rocblas_datatype_f32_r,
algo,
0,
0));
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
rocblas_datatype_f32_c,
ldb,
A,
rocblas_datatype_f32_c,
lda,
&c_beta,
C,
rocblas_datatype_f32_c,
N,
rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
......@@ -1223,64 +862,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
rocblas_datatype_f64_c,
ldb,
A,
rocblas_datatype_f64_c,
lda,
&c_beta,
C,
rocblas_datatype_f64_c,
N,
rocblas_datatype_f64_c);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1339,44 +920,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_datatype_f64_c);
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
T alpha,
const T *A,
int lda,
const T *B,
int ldb,
T beta,
T *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1416,46 +959,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
int lda,
const phi::dtype::float16 *B,
int ldb,
phi::dtype::float16 beta,
phi::dtype::float16 *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1496,16 +999,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n,
T alpha,
const T *x,
T *y) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
......@@ -1514,14 +1007,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n,
const T alpha,
T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -1529,14 +1014,6 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n,
const T *x,
T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
......@@ -1544,23 +1021,6 @@ void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
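For quick reference, the three level-1 wrappers above keep the standard BLAS semantics:

$$\text{AXPY: } y \leftarrow \alpha x + y,\qquad \text{SCAL: } x \leftarrow \alpha x,\qquad \text{VCOPY: } y \leftarrow x.$$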
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
rocblas_operation cuTransA =
!trans_a ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1579,26 +1039,6 @@ void Blas<phi::GPUContext>::GEMV(bool trans_a,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
  // Because rocblas doesn't support half gemv, we express it as a gemm call.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
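The two branches above are the usual reshaping of a half-precision GEMV into a GEMM, treating the vector as a one-column or one-row matrix (plain algebra, not part of the patch):

$$y = A x \;\Rightarrow\; \text{GEMM}(M,\,1,\,N),\qquad y = A^{\top} x \;\Longleftrightarrow\; y^{\top} = x^{\top} A \;\Rightarrow\; \text{GEMM}(1,\,N,\,M).$$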
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1619,26 +1059,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1659,56 +1079,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
const int64_t strideC = M * N;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
strideB,
A,
lda,
strideA,
&beta,
C,
ldc,
strideC,
batchCount);
});
}
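A plain-C++ reference (illustrative only; alpha = 1, beta = 0 assumed) of the strided-batch layout the call above relies on: batch b reads A + b*strideA and B + b*strideB and writes C + b*strideC, with strideC fixed to M*N:

#include <cstdint>

void StridedBatchedGemmRef(const float* A, const float* B, float* C,
                           int M, int N, int K, int batch_count,
                           std::int64_t strideA, std::int64_t strideB) {
  const std::int64_t strideC = static_cast<std::int64_t>(M) * N;
  for (int b = 0; b < batch_count; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + b * strideC;
    // Row-major GEMM for this batch entry.
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += Ab[m * K + k] * Bb[k * N + n];
        Cb[m * N + n] = acc;
      }
  }
}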
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1758,71 +1128,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
const int64_t strideC = M * N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_strided_batched_ex(
handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_bf16_r,
ldb,
strideB,
A,
rocblas_datatype_bf16_r,
lda,
strideA,
&h_beta,
C,
rocblas_datatype_bf16_r,
ldc,
strideC,
C,
rocblas_datatype_bf16_r,
ldc,
strideC,
batchCount,
rocblas_datatype_f32_r,
algo,
0,
0));
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1887,26 +1192,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T **A,
const T **B,
T beta,
T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1926,25 +1211,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 **A,
const phi::dtype::float16 **B,
phi::dtype::float16 beta,
phi::dtype::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::float16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1964,26 +1230,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 **A,
const phi::dtype::bfloat16 **B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::bfloat16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2003,37 +1249,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T *A,
int lda,
T *B,
int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM(
handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
......@@ -2066,14 +1281,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
int n, T **a, int *ipiv, int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
......@@ -2083,24 +1290,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size)
const {
PADDLE_ENFORCE_NE(
a_inv,
a,
phi::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv,
a));
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
......@@ -2123,14 +1312,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
int n, const T **a, T **a_inv, int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
......@@ -2140,27 +1321,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans,
int n,
int nrhs,
const T **a,
int lda,
int *ipiv,
T **b,
int ldb,
int *info,
int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(
handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
......@@ -2182,50 +1342,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T **A,
int lda,
T **B,
int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM_BATCH(handle,
cuSide,
cuUplo,
cuTransA,
cuDiag,
N,
M,
&alpha,
A,
lda,
B,
ldb,
batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
......
......@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
AddReluKernel(context.stream(), M, N, Y, B, relu);
}
template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
......
......@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
size_t limit_;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CUDADeviceContext> {
ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
inline void operator()(Function func) const {
phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
for_range(func);
}
const paddle::platform::CUDADeviceContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace funcs
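A hypothetical usage sketch of the surviving phi::funcs::ForRange<phi::GPUContext> specialization; the header path, dev_ctx, and ScaleFunctor below are illustrative assumptions, not code from this patch:

#include "paddle/phi/kernels/funcs/for_range.h"

// Functor invoked once per index i in [0, n) on the device.
struct ScaleFunctor {
  const float* in;
  float* out;
  __device__ void operator()(size_t i) const { out[i] = 2.0f * in[i]; }
};

void ScaleByTwo(const phi::GPUContext& dev_ctx,
                const float* in,
                float* out,
                size_t n) {
  phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx, n);
  for_range(ScaleFunctor{in, out});
}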
......
......@@ -31,22 +31,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
......@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
phi::dtype::complex<double>>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
double, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
float16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int8_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int32_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
DEFINE_GPU_TRANS(1);
......@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
};
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::GPUContext, TYPE>
DEFINE_GPU_TRANS_NORMAL(float16);
......
......@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;
// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
double>;
} // namespace funcs
} // namespace phi
......@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;
// TODO(wuweilong): remove these instantiations later
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
} // namespace funcs
} // namespace phi