Unverified commit a821c4a9, authored by Wilber, committed by GitHub

[PTEN] Add Gpu context (#39305)

Parent: dcff7fa8
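
The change that repeats throughout the test and inference hunks below is that a freshly constructed CUDADeviceContext is no longer fully usable on its own: callers now attach allocators and then call PartialInitWithAllocator(). A minimal sketch of that sequence, condensed from the hunks in this diff (the AllocatorFacade calls and method names are taken verbatim from the diff; the surrounding function is illustrative only):

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Sketch only: mirrors the initialization sequence this commit adds to tests.
void InitGpuContextForTest() {
  paddle::platform::CUDAPlace place(0);
  paddle::platform::CUDADeviceContext ctx(place);
  // Device allocator bound to this context's CUDA stream.
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  // Host-side allocator (CPU place).
  ctx.SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(paddle::platform::CPUPlace())
                           .get());
  // Allocator used for zero-size requests.
  ctx.SetZeroAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetZeroAllocator(place)
                           .get());
  // Finish the parts of initialization that depend on the allocators.
  ctx.PartialInitWithAllocator();
}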
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
......
......@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......
......@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
......
......@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
......
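The new specialization above completes the compile-time mapping from fluid device-context types to their pten counterparts. A hedged illustration of what the trait yields (this assumes the snippet sits in the same namespace as ConvertToPtenContext; the alias name is illustrative):

#include <type_traits>

// Sketch: the trait maps the fluid CUDA context type to pten::GPUContext.
using PtenGpuCtx =
    typename ConvertToPtenContext<platform::CUDADeviceContext>::TYPE;
static_assert(std::is_same<PtenGpuCtx, pten::GPUContext>::value,
              "CUDADeviceContext should map to pten::GPUContext");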
......@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
}
} // namespace pten
......@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
......@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
......@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<int> dst;
......@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<bool> dst;
......@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
......
......@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
......
......@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->PartialInitWithAllocator();
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
......
......@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
dev_ctx.PartialInitWithAllocator();
auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
......
......@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}
......@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
......
......@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
......@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
......
......@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input_tensor, output_tensor, kps::IdentityFunctor<T>(),
reduce_dims_vec, stream);
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
......
......@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
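The two hunks above, and several later ones, show the other recurring signature change: TensorReduceFunctorImpl now takes the CUDA device context as its first argument, ahead of the input tensor. A hedged before/after sketch of a call site, where in, out, reduce_dims, stream, and dev_ctx are placeholder names:

// Old call shape (before this commit), kept as a comment for comparison:
//   TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
//       in, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
// New call shape: the device context is now the leading argument.
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
    dev_ctx, in, out, kps::IdentityFunctor<T>(), reduce_dims, stream);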
......@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
......
......@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
reduce_dims, stream);
}
}
};
......
......@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
pt_z.get());
pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
*pt_x.get(), *pt_y.get(), axis, pt_z.get());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
......
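The elementwise_mul hunk above shows the hand-off convention for kernels that have moved to pten: the pten kernel takes a pten::GPUContext, so the operator explicitly casts the fluid CUDADeviceContext it holds (the cast only compiles if CUDADeviceContext is-a pten::GPUContext after this change, which is what the static_cast implies). A hedged sketch of the call site, with x_dense, y_dense, and z_dense as placeholder DenseTensors:

// Sketch: invoking a pten GPU kernel from an operator holding a fluid context.
const auto& cuda_ctx =
    ctx.template device_context<platform::CUDADeviceContext>();
pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
                           x_dense, y_dense, axis, &z_dense);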
......@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
framework::Tensor *src, framework::Tensor *dst) {
std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
......
......@@ -275,6 +275,18 @@ class TestFeedForward {
output_size_ = 3 * num_head_ * dim_head_;
input_size_ = dim_embed_;
ctx_ = new platform::CUDADeviceContext(place_);
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
ctx_->PartialInitWithAllocator();
size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim]
size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim]
......
......@@ -166,7 +166,8 @@ class AttnMatMul {
if (support_case_1 || support_case_2) {
gpuStream_t stream = dev_ctx_.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support reduce when the input dims are [0,1,2,3,4] and "
......
......@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluFwdCUDAKernel< \
......@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
is_aligned(x_g, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluBwdCUDAKernel< \
......
......@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
// 1. Insert data into keys and values.
......@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
ReindexSrcOutput<
......
......@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......
......@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
const int size_from_axis = SizeFromAxis(axis, X.dims());
const int size_out_axis = SizeOutAxis(axis, X.dims());
constexpr int thread_size = 512;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
int64_t height = size_to_axis * size_out_axis;
int block_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
......
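A second API change repeated across the kernels above: GetCUDAMaxGridDimSize() no longer returns a dim3, so callers index the result with [0]/[1]/[2] instead of reading .x/.y/.z. A sketch of the new-style 1-D grid clamp, with n, block, and dev_ctx as placeholders:

// Clamp the requested grid size to the device's limit along dimension 0.
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];  // was .x
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;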
......@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
int block_y = std::min(GetLastPow2(height), max_threads / block_x);
int block_z = std::min(num_img, max_threads / block_x / block_y);
dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
auto max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
int grid_z =
std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
const int capability = context.GetComputeCapability();
platform::GpuLaunchConfig config;
......
......@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
}
if (dy) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
}
#else
auto* place = dev_ctx.eigen_device();
......
......@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
......
......@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
......@@ -129,6 +131,83 @@ void TestBeamSearch() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -18,6 +18,7 @@
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
......@@ -92,6 +93,32 @@ struct CUBlas<float> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const float *beta, void *C, cudaDataType_t Ctype,
int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif
}
template <typename... ARGS>
static void TRSM(ARGS... args) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...));
......@@ -273,6 +300,37 @@ struct CUBlas<platform::float16> {
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
......@@ -388,6 +446,37 @@ struct CUBlas<platform::complex<float>> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
cublasFillMode_t uplo, cublasOperation_t transa,
cublasDiagType_t diag, int m, int n,
......@@ -529,6 +618,37 @@ struct CUBlas<platform::complex<double>> {
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
......@@ -564,6 +684,39 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, N);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -611,6 +764,55 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -659,6 +861,56 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A,
CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::complex<float>>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&c_alpha, h_B, ldb, h_A, lda,
&c_beta, h_C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -708,6 +960,57 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A,
CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::complex<double>>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&c_alpha, h_B, ldb, h_A, lda,
&c_beta, h_C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
......@@ -738,6 +1041,35 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, ldc);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -755,6 +1087,25 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
});
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
......@@ -764,6 +1115,14 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -771,6 +1130,13 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
......@@ -778,6 +1144,13 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
......@@ -790,6 +1163,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
......@@ -806,6 +1190,24 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -854,6 +1256,56 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#endif // CUDA_VERSION >= 9010
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
#if CUDA_VERSION >= 9010
if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
std::is_same<T, paddle::platform::float16>::value) {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A,
fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo));
});
} else {
#endif // CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
#if CUDA_VERSION >= 9010
}
#endif // CUDA_VERSION >= 9010
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -865,6 +1317,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -878,6 +1343,19 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
......@@ -903,6 +1381,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
......@@ -913,6 +1415,15 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
......@@ -931,6 +1442,23 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
......@@ -941,6 +1469,15 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS(
......@@ -955,6 +1492,21 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedTRSM(
......@@ -979,6 +1531,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math
} // namespace operators
} // namespace paddle
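
The block above mirrors every Blas<platform::CUDADeviceContext> routine for pten::GPUContext, so BLAS wrappers can be obtained from the new context type as well. A hedged usage sketch; instantiating GetBlas with pten::GPUContext is an assumption extrapolated from the GetBlas<platform::CPUDeviceContext, T>(cpu_ctx) call at the top of this diff, and gpu_ctx, M, N, K, A, B, C are placeholders:

// Sketch: fetch the cuBLAS-backed wrapper from a pten GPU context and run GEMM.
auto blas = paddle::operators::math::GetBlas<pten::GPUContext, float>(gpu_ctx);
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, A, B, 0.0f, C);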
......@@ -17,6 +17,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
......@@ -221,6 +222,20 @@ struct CUBlas<platform::float16> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -305,6 +320,20 @@ struct CUBlas<platform::complex<float>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -389,6 +418,20 @@ struct CUBlas<platform::complex<double>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -412,6 +455,27 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
&beta, C, N);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, N);
});
}
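
The swapped-operand call above is the standard trick for driving a column-major BLAS from row-major buffers: a row-major matrix reinterpreted as column-major is its transpose, so C = A*B in row-major storage is exactly C^T = B^T * A^T in column-major storage. The following standalone sketch (plain C++, independent of Paddle; the 2x3 and 3x2 sizes are illustrative assumptions) checks that handing a column-major GEMM the operands in swapped order with leading dimensions N and K reproduces the row-major product:

#include <cassert>
#include <vector>

// Column-major GEMM reference: C(m x n) = A(m x k) * B(k x n), no transposes.
static void gemm_col_major(int m, int n, int k, const std::vector<float>& A,
                           int lda, const std::vector<float>& B, int ldb,
                           std::vector<float>* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      (*C)[i + j * ldc] = acc;
    }
}

int main() {
  const int M = 2, N = 2, K = 3;
  std::vector<float> A = {1, 2, 3, 4, 5, 6};     // 2x3, row-major
  std::vector<float> B = {7, 8, 9, 10, 11, 12};  // 3x2, row-major
  std::vector<float> C(M * N, 0.f);
  // Same pattern as the rocblas call above: dimensions (N, M, K), operands
  // passed in the order (B, A) with leading dimensions N and K. The output
  // buffer then holds C = A*B laid out in row-major order.
  gemm_col_major(N, M, K, B, N, A, K, &C, N);
  const float expected[] = {58, 64, 139, 154};   // row-major A*B
  for (int i = 0; i < M * N; ++i) assert(C[i] == expected[i]);
  return 0;
}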
template <>
template <>
......@@ -448,6 +512,43 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B,
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}
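
With these specializations in place, kernels that already hold a pten::GPUContext can go through the same GetBlas helper used elsewhere in the tree. A hedged sketch of such a call site (not part of this diff; the free function and its name are assumptions for illustration, and the fp16 path still requires compute capability >= 53 as enforced above):

#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

template <typename T>
void RowMajorGemm(const pten::GPUContext& dev_ctx, int M, int N, int K,
                  const T* A, const T* B, T* C) {
  auto blas = paddle::operators::math::GetBlas<pten::GPUContext, T>(dev_ctx);
  // C = 1 * A * B + 0 * C, row-major, dispatched through the rocblas-backed
  // Blas<pten::GPUContext>::GEMM specialization above.
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1.0), A, B,
            static_cast<T>(0.0), C);
}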
template <>
template <>
......@@ -485,6 +586,44 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}
template <>
template <>
......@@ -523,6 +662,45 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}
template <>
template <typename T>
......@@ -541,6 +719,22 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
&beta, C, ldc);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, ldc);
});
}
template <>
template <>
......@@ -560,6 +754,26 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <typename T>
......@@ -569,6 +783,13 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
......@@ -576,6 +797,12 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
......@@ -583,6 +810,12 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
......@@ -596,6 +829,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
rocblas_operation cuTransA =
!trans_a ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
......@@ -612,6 +856,23 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
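
The fall-back above works because a matrix-vector product is just a matrix-matrix product with a unit dimension: y = A*x with A of shape M x N is the GEMM (M, 1, N), and the transposed case x^T*A is the GEMM (1, N, M), which is exactly what the two branches issue. A minimal standalone check of the non-transposed branch (plain C++; the sizes are illustrative assumptions):

#include <cassert>
#include <vector>

// Row-major GEMM reference: C(m x n) = A(m x k) * B(k x n).
static void gemm_row_major(int m, int n, int k, const std::vector<float>& A,
                           const std::vector<float>& B,
                           std::vector<float>* C) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * n + j];
      (*C)[i * n + j] = acc;
    }
}

int main() {
  const int M = 2, N = 3;
  std::vector<float> A = {1, 2, 3, 4, 5, 6};  // 2x3, row-major
  std::vector<float> x = {1, 1, 1};           // length 3
  std::vector<float> y(M, 0.f);
  // GEMV y = A*x written as the GEMM (M, 1, N), mirroring the !trans_a branch.
  gemm_row_major(M, 1, N, A, x, &y);
  assert(y[0] == 6.f && y[1] == 15.f);
  return 0;
}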
template <>
template <typename T>
......@@ -637,6 +898,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
ldc, strideC, batchCount);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
const int64_t strideC = M * N;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
}
template <>
template <typename T>
......@@ -648,6 +935,18 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
C[k]);
}
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <>
template <>
......@@ -661,6 +960,18 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <>
template <typename T>
......@@ -687,6 +998,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
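
The side/uplo flipping above is the same row-major reinterpretation at work: reading each row-major buffer as column-major hands rocblas the transposed system. A short derivation sketch (math only, ^T denotes transpose):

\[
  op(A)\,X = \alpha B \;\Longleftrightarrow\; X^{T}\,op(A)^{T} = \alpha B^{T}
\]

A solve from the left therefore becomes a solve from the right (CblasLeft maps to rocblas_side_right), a lower-triangular A becomes upper-triangular once transposed (CblasLower maps to rocblas_fill_upper), and M and N are swapped because B^T has the transposed shape. The transpose flag on A passes through unchanged because the buffer rocblas reads already stores A^T.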
template <>
template <typename T>
......@@ -697,6 +1032,14 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
......@@ -715,6 +1058,22 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
......@@ -725,6 +1084,14 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
......@@ -739,6 +1106,20 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <>
template <typename T>
......@@ -764,6 +1145,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
&alpha, A, lda, B, ldb, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math
} // namespace operators
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
* case 1:
......@@ -441,6 +443,31 @@ void TestConcatMain() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
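
The SetAllocator / PartialInitWithAllocator sequence above recurs in nearly every GPU test touched by this change: the allocator is bound to the context's stream, so it can only be attached after the context exists, and the remaining initialization is then finished explicitly. A hypothetical helper (not part of this diff; the function name is an assumption) that wraps the boilerplate used in these tests:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

inline paddle::platform::CUDADeviceContext* MakeTestCUDAContext(
    const paddle::platform::CUDAPlace& place) {
  auto* ctx = new paddle::platform::CUDADeviceContext(place);
  // The allocator depends on the context's stream, so it is attached after
  // construction; PartialInitWithAllocator then completes the setup that
  // needs the allocator.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->PartialInitWithAllocator();
  return ctx;
}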
TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const int rows, const int cols) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
T>(*context);
......@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const std::string& mode) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
// get sparse
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testIm2col() {
......@@ -60,6 +62,7 @@ void testIm2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
......@@ -164,6 +167,165 @@ void testIm2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
......@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
......
......@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
......
......@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testVol2col() {
......@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
/**
* input = [[0, 1, 2,
* 3, 4, 5]
......@@ -123,6 +124,124 @@ void testVol2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream);
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
}
};
......
......@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
}
NCCLInitOp();
......
......@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
......
......@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim,
stream);
dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
reduce_dim, stream);
#else // for cpu
paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
......@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
}
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
......
......@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place);
// create var on local scope
......
......@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream);
static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
}
} // namespace operators
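
TensorReduceFunctorImpl now takes the device context as its first argument and forwards it (cast to pten::GPUContext) to the pten implementation; the stream is still passed explicitly. A hedged call-site sketch mirroring the updated callers elsewhere in this diff (mean_op.cu, trace_op.cu, ...), assuming the wrapper's header and the kernel_primitives namespace are in scope; the function name and the AddFunctor/IdentityFunctor choice are illustrative assumptions:

namespace kps = paddle::operators::kernel_primitives;

template <typename T>
void SumOverDims(const paddle::framework::ExecutionContext& context,
                 const paddle::framework::Tensor& x,
                 paddle::framework::Tensor* out,
                 const std::vector<int>& reduce_dims) {
  auto& dev_ctx = context.cuda_device_context();
  gpuStream_t stream = dev_ctx.stream();
  // Device context first, then input/output, transform, dims, and stream.
  paddle::operators::TensorReduceFunctorImpl<T, T, kps::AddFunctor,
                                             kps::IdentityFunctor<T>>(
      dev_ctx, x, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
}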
......
......@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
......@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis,
stream);
ctx.cuda_device_context(), mul_value, &weight_derivative,
kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()),
......
......@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
}
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize()
.x;
.GetCUDAMaxGridDimSize()[0];
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<<
......
......@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx)
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
......@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
......@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
......@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
......@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables.
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
......@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......
......@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
......
......@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else {
math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
......@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
}
const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif()
if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool)
endif()
......
......@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
......@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id));
......@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm);
dev_ctx->SetBkclContext(comm);
}
return comm_map_[ring_id][dev_id].get();
......
IF(WITH_GPU)
add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM)
add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda)
hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
......@@ -14,8 +14,10 @@
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(const std::function<void(blasHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle {
namespace platform {
......@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(
const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
......@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
......@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
......@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
......@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int);
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
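
GetGpuMaxGridDimSize now returns a std::array<int, 3> instead of a dim3, which is why call sites across this diff switch from ".x" to "[0]". A minimal usage sketch (the helper name is an assumption):

#include <array>

#include "paddle/fluid/platform/device/gpu/gpu_info.h"

inline int MaxGridDimX(int dev_id) {
  const std::array<int, 3> max_grid =
      paddle::platform::GetGpuMaxGridDimSize(dev_id);
  return max_grid[0];  // formerly max_grid.x
}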
......@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations.
void GpuDeviceync();
void GpuDeviceSync();
//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
......@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_;
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
explicit NCCLContext(int dev_id) : comm_{nullptr} {
ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
......
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
......@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
......@@ -149,14 +153,15 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->context()->RawStream())
      // Note: this is a trick to init the context. Why does the GetAllocator
      // interface need a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif
} else {
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
}
......@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
}
}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; }
const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
......@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
......@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
......@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
......@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetGPUComputeCapability(place_.device);
multi_process_ = GetGPUMultiProcessors(place_.device);
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatibility
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: pten::GPUContext(place) {
pten::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(
new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}
int CUDADeviceContext::GetSMCount() const { return multi_process_; }
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
CUDADeviceContext::~CUDADeviceContext() = default;
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
if (thread_ctx_.count(this)) {
return context()->EigenDevice().get();
}
return pten::GPUContext::eigen_device();
}
bool CUDADeviceContext::tensor_core_available() const {
return context()->CublasTensorCoreHandle() != nullptr;
}
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
void CUDADeviceContext::Wait() const {
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
}
#ifdef PADDLE_WITH_HIP
......@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
}
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
}
#endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
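// Illustrative caller sketch (assumed usage, shown for reference only);
// `dev_ctx` stands for a CUDADeviceContext reference:
//
//   dev_ctx.AddStreamCallback([] { VLOG(3) << "stream work finished"; });
//   dev_ctx.WaitStreamCallback();
//
// Both calls forward to pten::GPUContext unless a thread-local CUDAContext has
// been installed for this context.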
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
gpuStream_t CUDADeviceContext::stream() const {
if (thread_ctx_.count(this)) {
return context()->RawStream();
}
return pten::GPUContext::stream();
}
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
......@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h"
......@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
......@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
......@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
......@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
......@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......@@ -326,20 +331,20 @@ class CUDAContext {
#endif
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tf32_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback));
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(callback);
}
#endif
......@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
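// Illustrative sketch of the std::function-based callback (assumed caller,
// not part of this patch); `dev_ctx` is a CUDADeviceContext or pten::GPUContext:
//
//   dev_ctx.CublasCall([&](blasHandle_t handle) {
//     // invoke the desired cuBLAS/rocBLAS routine with `handle` here
//   });
//
// The TF32 tensor-core handle is preferred automatically when it is available.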
......@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
class CUDADeviceContext : public DeviceContext {
class CUDADeviceContext : public pten::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
......@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback);
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
return context()->CusparseCall(callback);
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
}
#endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback);
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
......@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const;
#endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
......@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void WaitStreamCallback() const {
return context()->Stream()->WaitCallback();
}
void AddStreamCallback(const std::function<void()>& callback) const;
void ResetDefaultContext(const stream::Priority& priority) {
default_ctx_.reset(new CUDAContext(place_, priority));
}
void WaitStreamCallback() const;
void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority));
thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
}
std::shared_ptr<CUDAContext> context() const {
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
std::shared_ptr<CUDAContext> context() const;
// Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream);
}
private:
CUDAPlace place_;
std::shared_ptr<CUDAContext> default_ctx_;
// NOTE: Kept only for backward compatibility; remove this once a more
// elegant approach is available.
stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*,
......@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
// NOTE: Kept only for backward compatibility; remove this once a more
// elegant approach is available.
std::unique_ptr<stream::CUDAStream> cuda_stream_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
......@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
......@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
delete device_context;
......@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
......
......@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream());
wrapper->inner_event_.Record(cuda_dev_ctx->stream());
}
bool DeviceEventQueryCUDA(const DeviceEvent* event) {
......@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent(
wrapper->inner_event_.GetRawCudaEvent());
cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
}
void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU;
......@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator
DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder
event.Record(context);
bool status = event.Query();
status = event.Query();
ASSERT_EQ(status, false);
// case 3. test for event_finisher
event.Finish();
......
......@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......
......@@ -152,11 +152,11 @@ class CudaEvent {
#endif
}
void Record(const paddle::platform::stream::CUDAStream &stream) {
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
......
......@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle {
namespace platform {
......@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
}
}
// NOTE: Delete this specialization after the pten kernels have been migrated.
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
......@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_;
};
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will throw core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
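// Illustrative usage sketch for the new specialization (assumed caller, not
// part of this patch). A functor with a device-callable operator() is applied
// to every index in [0, limit):
//
//   struct Scale {
//     float* data;
//     float factor;
//     HOSTDEVICE void operator()(size_t i) const { data[i] *= factor; }
//   };
//   platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
//   for_range(Scale{buf, 2.0f});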
#endif
} // namespace platform
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->Stream()
.get();
return stream;
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->GetCudaStream();
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
......@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->SetStream(stream);
->SetCudaStream(stream);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
return nullptr;
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
return CUDAStream(nullptr);
#endif
}
} // namespace stream
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag);
}
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback>
void AddCallback(Callback&& callback) const {
void AddCallback(std::function<void()> callback) const {
callback_manager_->AddCallback(callback);
}
template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
}
......@@ -149,6 +154,7 @@ class CUDAStream final {
};
CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with this interface and it needs to be fixed.
CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h"
......@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
......@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
......@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current "
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
#endif
},
......@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event;
event.Record(stream);
event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent());
},
......@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) {
event = new paddle::platform::CudaEvent();
}
event->Record(self);
event->Record(self.raw_stream());
return event;
},
R"DOC(
......@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1);
}
self.Record(*stream);
self.Record(stream->raw_stream());
},
R"DOC(
Records the event in the given stream.
......
......@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
......@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif
})
.def_static("create",
......@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#else
return new paddle::platform::CUDADeviceContext(place);
auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif
})
.def_static("create",
......
......@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU)
add_subdirectory(xpu)
endif()
......@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
......@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add a WITH_EIGEN option to support
// compiling without Eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {}
// Users need to manage external resources.
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
explicit Impl(const Place& place) : place_(place) {}
~CPUImpl() {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
~Impl() {
if (owned_) {
delete eigen_device_;
}
}
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr.");
return device_;
void Init() {
owned_ = true;
eigen_device_ = new Eigen::DefaultDevice();
}
void SetEigenDevice(Eigen::DefaultDevice* device) {
if (device == nullptr) {
return;
}
res_.device = device;
device_ = device;
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return eigen_device_;
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Place place_;
};
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
void CPUContext::Init() { impl_->Init(); }
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice();
return impl_->GetEigenDevice();
}
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device);
impl_->eigen_device_ = device;
}
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten
......@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext {
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext();
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
explicit CPUContext(const Place&);
virtual ~CPUContext();
Eigen::DefaultDevice* eigen_device() const;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
const Place& GetPlace() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit CPUContext(const CPUContextResource& ctx_res);
// NOTE: DeviceContext holds the resources. Used in training scenarios.
// This interface is for training: DeviceContext initializes all resources
// and deletes them on destruction.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only: DeviceContext marks the
// resource as external and will not delete it on destruction.
void SetEigenDevice(Eigen::DefaultDevice* device);
private:
struct CPUImpl;
std::unique_ptr<CPUImpl> cpu_impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
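// Illustrative sketch of the training-scenario flow described above (assumed
// caller, not part of this patch): the context owns its Eigen device and
// releases it in the destructor.
//
//   pten::CPUContext ctx;
//   ctx.Init();                               // allocates Eigen::DefaultDevice
//   Eigen::DefaultDevice* dev = ctx.eigen_device();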
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why is this macro needed? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This loop condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, for example over 1GB. The first iteration is fine, but
* once `i += blockDim.x * gridDim.x` is executed, the value of i can
* exceed INT_MAX and overflow to a negative value. At that point the
* loop condition `i < (n)` is still satisfied, so the kernel performs an
* illegal access to CUDA memory.
*
* Here is a real example from ERNIE that triggers the error above.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__
* prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
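// Illustrative kernel sketch for the typed loop macro (assumed example, not
// part of this patch):
//
//   template <typename T>
//   __global__ void Fill(T* out, T value, int64_t num) {
//     CUDA_KERNEL_LOOP_TYPE(i, num, int64_t) { out[i] = value; }
//   }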
} // namespace gpu
} // namespace backends
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion();
......@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
......@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
......@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
}
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
......@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
......@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
......@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
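// Illustrative caller sketch for the std::array-based interface (assumed usage,
// not part of this patch):
//
//   std::array<int, 3> max_grid = GetGpuMaxGridDimSize(0);
//   int grid_x_limit = max_grid[0];  // formerly max_grid.x with the dim3 API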
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include <array>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cusparse.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/pten/backends/dynload/nccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/pten/backends/dynload/rccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#endif // PADDLE_WITH_HIP
// NOTE: The paddle framework should add a WITH_EIGEN option to support
// compiling without Eigen.
#include "unsupported/Eigen/CXX11/Tensor"
// TODO(pten): remove fluid header.
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace internal {
class EigenGpuStreamDevice : public Eigen::StreamInterface {
public:
EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
Eigen::initializeDeviceProp();
}
~EigenGpuStreamDevice() override {}
void Reinitialize(gpuStream_t cuda_stream,
Allocator* allocator,
GPUPlace place) {
stream_ = cuda_stream;
place_ = place;
allocator_ = allocator;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
const gpuStream_t& stream() const override { return stream_; }
const gpuDeviceProp& deviceProperties() const override {
return *device_prop_;
}
void* allocate(size_t num_bytes) const override {
if (UNLIKELY(num_bytes == 0)) {
return nullptr;
}
auto buf = allocator_->Allocate(num_bytes);
VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested "
<< num_bytes;
void* retv = buf->ptr();
{
std::lock_guard<std::mutex> lock(mtx_);
allocations_.emplace(retv, std::move(buf));
}
return retv;
}
void deallocate(void* buffer) const override {
if (LIKELY(buffer)) {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.erase(buffer);
}
}
void* scratchpad() const override {
if (scratch_ == NULL) {
scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
}
return scratch_;
}
unsigned int* semaphore() const override {
if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#endif
}
return semaphore_;
}
private:
GPUPlace place_;
gpuStream_t stream_; // not owned;
Allocator* allocator_; // not owned;
const gpuDeviceProp* device_prop_; // not owned;
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
mutable std::unordered_map<void*, Allocator::AllocationPtr> allocations_;
};
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream,
gpuError_t status,
void* user_data)
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void* user_data)
#else
static void CUDART_CB
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data)
#endif
#endif
{
std::unique_ptr<std::function<void()>> func(
reinterpret_cast<std::function<void()>*>(user_data));
(*func)();
}
} // namespace internal
class DnnWorkspaceHandle {
public:
explicit inline DnnWorkspaceHandle(Allocator* allocator)
: allocator_(allocator) {}
inline void RunFunc(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
if (required_workspace_bytes > WorkspaceSize()) {
ReallocWorkspace(required_workspace_bytes);
}
VLOG(2) << "Cudnn workspace size at RunFunc: "
<< static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
{
std::lock_guard<std::mutex> guard(mtx_);
cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
}
}
/*! \brief Threads that call RunFuncSync() release the GPU memory after
 * running the function. Currently this function is only used during cudnn
 * exhaustive search, and callers must guarantee that the input function
 * is host blocking. */
inline void RunFuncSync(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
RunFunc(cudnn_func, required_workspace_bytes);
ResetWorkspace();
}
inline size_t WorkspaceSize() {
if (allocation_ == nullptr) {
return 0;
}
return allocation_->size();
}
void ResetWorkspace() { allocation_ = nullptr; }
void ReallocWorkspace(size_t required_workspace_bytes) {
if (required_workspace_bytes <= WorkspaceSize()) return;
// reset allocation first before re-allocate to save memory
allocation_.reset();
allocation_ = allocator_->Allocate(required_workspace_bytes);
}
private:
Allocator::AllocationPtr allocation_{nullptr};
Allocator* allocator_{nullptr};
std::mutex mtx_;
};
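// A minimal usage sketch (not part of this diff): the handle lends a scratch
// buffer to a cudnn call and grows it on demand. `allocator` is assumed to be
// a valid pten::Allocator* and `FindAlgo` is a hypothetical cudnn-style
// callable used only for illustration.
//
//   DnnWorkspaceHandle workspace(allocator);
//   size_t required_bytes = 64 << 20;   // request 64 MB of scratch space
//   workspace.RunFunc(
//       [&](void* scratch) { FindAlgo(scratch, required_bytes); },
//       required_bytes);
//   // RunFuncSync() additionally resets the workspace afterwards; it is meant
//   // for cudnn exhaustive search and requires a host-blocking callable.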
struct GPUContext::Impl {
void Init() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitEigenDevice();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
InitDnnWorkspace();
}
void PartialInitWithoutAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
}
void PartialInitWithAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitEigenDevice();
InitDnnWorkspace();
}
Impl() : place_(GPUPlace()) {}
explicit Impl(const GPUPlace& place) : place_(place) {}
~Impl() {
backends::gpu::GPUDeviceGuard guard(place_.device);
DestoryInternalWorkspace();
DestoryInternalEigenDevice();
DestroyInternalSparseHandle();
DestroyInternalSolverHandle();
DestroyInternalDnnHandle();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
DestroyInternalBlasHandle();
DestoryInternalStream();
}
const Place& GetPlace() const { return place_; }
bool IsTensorCoreAvailable() const {
return blas_tensor_core_handle_ != nullptr;
}
void InitGpuProperties() {
backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId());
compute_capability_ =
backends::gpu::GetGPUComputeCapability(place_.GetDeviceId());
multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId());
max_threads_per_mp_ =
backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId());
max_grid_dim_size_ =
backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId());
max_threads_per_block_ =
backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId());
driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId());
runtime_version_ =
backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: device: " << static_cast<int>(place_.device)
<< ", GPU Compute Capability: " << compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000 << "."
<< (driver_version_ % 100) / 10
<< ", Runtime API Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch));
auto cudnn_dso_ver =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (cudnn_dso_ver < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "." << compile_miopen_version % 100
<< ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100
<< "." << cudnn_dso_ver % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000
<< "." << (cudnn_dso_ver % 1000) / 100 << ".";
// Check CUDA/CUDNN version compatibility
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
#endif
}
void InitDnnWorkspace() {
PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr.");
workspace_ = new DnnWorkspaceHandle(allocator_);
}
void DestoryInternalWorkspace() {
if (owned_ && workspace_ != nullptr) {
delete workspace_;
workspace_ = nullptr;
}
}
DnnWorkspaceHandle* GetDnnWorkspace() {
PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
return workspace_;
}
void InitStream() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0));
#endif
}
void DestoryInternalStream() {
if (owned_ && stream_ != nullptr) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_));
#endif
}
stream_ = nullptr;
}
void SetStream(gpuStream_t stream) { stream_ = stream; }
gpuStream_t GetStream() const {
PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr.");
return stream_;
}
void InitEigenDevice() {
PD_CHECK(allocator_ != nullptr,
"the allocator for eigen device is nullptr.");
eigen_stream_.reset(new internal::EigenGpuStreamDevice());
eigen_stream_->Reinitialize(stream_, allocator_, place_);
eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get());
}
void DestoryInternalEigenDevice() {
if (owned_ && eigen_device_ != nullptr) {
delete eigen_device_;
eigen_device_ = nullptr;
}
}
void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; }
Eigen::GpuDevice* eigen_device() const {
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr.");
return eigen_device_;
}
void InitBlasHandle() {
#ifdef PADDLE_WITH_HIP
pten::dynload::rocblas_create_handle(&blas_handle_);
pten::dynload::rocblas_set_stream(blas_handle_, stream_);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_handle_, stream_));
#if CUDA_VERSION >= 9000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
#endif // PADDLE_WITH_HIP
}
void DestroyInternalBlasHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::rocblas_destroy_handle(blas_handle_);
blas_handle_ = nullptr;
}
#else
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_handle_);
blas_handle_ = nullptr;
}
if (owned_ && blas_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tensor_core_handle_);
blas_tensor_core_handle_ = nullptr;
}
if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_);
blas_tf32_tensor_core_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
blasHandle_t GetBlasHandle() const {
PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
return blas_handle_;
}
void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
void InitDNNHandle() {
if (pten::dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "."
<< compile_miopen_version % 100
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 100 << "." << local_miopen_version % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenSetStream(dnn_handle_, stream_));
#else
auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cudnnSetStream(dnn_handle_, stream_));
#endif
} else {
dnn_handle_ = nullptr;
}
}
dnnHandle_t GetDnnHandle() {
PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
return dnn_handle_;
}
void DestroyInternalDnnHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#else
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
void InitSolverHandle() {
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cusolverDnSetStream(solver_handle_, stream_));
#endif
}
void DestroyInternalSolverHandle() {
#ifndef PADDLE_WITH_HIP
if (owned_ && solver_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
pten::dynload::cusolverDnDestroy(solver_handle_));
solver_handle_ = nullptr;
}
#endif
}
solverHandle_t GetSolverHandle() const {
PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
return solver_handle_;
}
void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }
void InitSparseHandle() {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusparseSetStream(sparse_handle_, stream_));
#endif
#endif
}
void DestroyInternalSparseHandle() {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if (owned_ && sparse_handle_ != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_));
sparse_handle_ = nullptr;
}
#endif
#endif
}
sparseHandle_t GetSparseHandle() const {
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; }
void Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
#if !defined(_WIN32)
e_sync = hipStreamSynchronize(stream_);
#else
while (e_sync = hipStreamQuery(stream_)) {
if (e_sync == hipErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#else // PADDLE_WITH_HIP
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(e_sync);
}
void WaitEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
#endif
}
ncclComm_t GetNcclComm() const {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr.");
return nccl_comm_;
#endif
return nullptr;
}
void SetNcclComm(ncclComm_t comm) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_comm_ = comm;
#endif
}
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tf32_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
callback(blas_tf32_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tensor_core_mtx_);
callback(blas_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const {
callback();
RecordEvent(ev);
}
void RecordEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
#endif
}
void AddStreamCallback(const std::function<void()>& callback) const {
// TODO(wilber): Do we need ThreadPool?
auto* func = new std::function<void()>([this, callback] {
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
last_future_ = std::async(std::launch::deferred, [&]() { callback(); });
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
PADDLE_ENFORCE_GPU_SUCCESS(
cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#endif
}
void WaitStreamCallback() const {
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
pten::backends::gpu::GpuStreamSync(stream_);
#endif
{
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
if (last_future_.valid()) {
last_future_.wait();
}
}
}
bool owned_{false};
Place place_;
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
std::array<int, 3> max_grid_dim_size_;
gpuStream_t stream_{nullptr};
Eigen::GpuDevice* eigen_device_{nullptr};
blasHandle_t blas_handle_{nullptr};
blasHandle_t blas_tensor_core_handle_{nullptr};
blasHandle_t blas_tf32_tensor_core_handle_{nullptr};
dnnHandle_t dnn_handle_{nullptr};
solverHandle_t solver_handle_{nullptr};
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provide fast collectives over multiple GPUs,
// both within and across nodes.
// However, this communicator is only used for collectives over multiple GPUs
// within a single node.
// NOTE: Distributed communicator, distributed framework manages its
// resources.
ncclComm_t nccl_comm_{nullptr};
#endif
mutable std::mutex blas_mtx_;
mutable std::mutex blas_tensor_core_mtx_;
mutable std::mutex blas_tf32_mtx_;
mutable std::mutex sparse_mtx_;
mutable std::mutex stream_call_back_mtx_;
mutable std::future<void> last_future_;
Allocator* allocator_{nullptr}; // external resource.
// An internal resource to initialize eigen_device.
std::unique_ptr<internal::EigenGpuStreamDevice> eigen_stream_{nullptr};
};
GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
GPUContext::GPUContext(const GPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
GPUContext::~GPUContext() = default;
const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); }
gpuStream_t GPUContext::stream() const { return impl_->GetStream(); }
dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); }
blasHandle_t GPUContext::cublas_handle() const {
return impl_->GetBlasHandle();
}
solverHandle_t GPUContext::cusolver_dn_handle() const {
return impl_->GetSolverHandle();
}
sparseHandle_t GPUContext::cusparse_handle() const {
return impl_->GetSparseHandle();
}
void GPUContext::Wait() const { impl_->Wait(); }
void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); }
bool GPUContext::tensor_core_available() const {
return impl_->IsTensorCoreAvailable();
}
int GPUContext::GetComputeCapability() const {
return impl_->compute_capability_;
}
int GPUContext::GetMaxPhysicalThreadCount() const {
return impl_->multi_process_ * impl_->max_threads_per_mp_;
}
int GPUContext::GetSMCount() const { return impl_->multi_process_; }
int GPUContext::GetMaxThreadsPerBlock() const {
return impl_->max_threads_per_block_;
}
std::array<int, 3> GPUContext::GetCUDAMaxGridDimSize() const {
return impl_->max_grid_dim_size_;
}
Eigen::GpuDevice* GPUContext::eigen_device() const {
return impl_->eigen_device();
}
DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() {
return impl_->GetDnnWorkspace();
}
void GPUContext::CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
impl_->CublasCall(callback);
}
void GPUContext::TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
impl_->TensorCoreCublasCallIfAvailable(callback);
}
void GPUContext::CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
impl_->CusparseCall(callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev,
const std::function<void()>& callback) const {
impl_->RecordEvent(ev, callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); }
void GPUContext::AddStreamCallback(
const std::function<void()>& callback) const {
impl_->AddStreamCallback(callback);
}
void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); }
ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); }
void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); }
void GPUContext::Init() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->Init();
}
void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); }
void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) {
impl_->SetEigenDevice(device);
}
void GPUContext::SetBlasHandle(blasHandle_t blas) {
impl_->SetBlasHandle(blas);
}
void GPUContext::SetDnnHandle(dnnHandle_t handle) {
impl_->SetDnnHandle(handle);
}
void GPUContext::SetSolverHandle(solverHandle_t handle) {
impl_->SetSolverHandle(handle);
}
void GPUContext::SetSparseHandle(sparseHandle_t handle) {
impl_->SetSparseHandle(handle);
}
void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) {
impl_->workspace_ = handle;
}
void GPUContext::PartialInitWithoutAllocator() {
impl_->PartialInitWithoutAllocator();
}
void GPUContext::PartialInitWithAllocator() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->PartialInitWithAllocator();
}
void GPUContext::SetComputeCapability(int val) {
impl_->compute_capability_ = val;
}
void GPUContext::SetMaxThreadsPerMultiProcessor(int val) {
impl_->max_threads_per_mp_ = val;
}
void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; }
void GPUContext::SetMaxThreadsPerBlock(int val) {
impl_->max_threads_per_block_ = val;
}
void GPUContext::SetMaxGridDimSize(const std::array<int, 3>& val) {
impl_->max_grid_dim_size_ = val;
}
void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; }
void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; }
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include <array>
#include <functional>
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
} // namespace pten
#endif
class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
* would be acquired to prevent other threads from accessing the
* workspace. Once the handle is destructed, the lock would be released.
*/
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
// Note that you must set the Allocator before calling Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter?
// The temporary trick method bypasses this problem, and the following
// interfaces need to be deleted later.
// Note that this is a trick implementation, which can be used to partially
// initialize when the SetAllocator interface is not called.
void PartialInitWithoutAllocator();
// Note that this is a trick implementation that can be used to initialize
// resources that require an Allocator when the SetAllocator interface is
// called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
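// A minimal usage sketch (not part of this diff) for the training scenario
// described in the notes above. It assumes the DeviceContext base class
// exposes a SetAllocator() interface and that `alloc` is a valid
// pten::Allocator*.
//
//   pten::GPUContext ctx(pten::GPUPlace(0));
//   ctx.SetAllocator(alloc);                // must be called before Init()
//   ctx.Init();                             // creates stream/blas/dnn/... handles
//   ctx.CublasCall([](blasHandle_t handle) { /* use the cublas handle */ });
//   ctx.Wait();                             // block until the stream finishes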
} // namespace pten
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
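// For example, on a CUDA build the first declaration below expands to
//   using gpuStream_t = cudaStream_t;
// while a ROCm (PADDLE_WITH_HIP) build selects the third argument instead:
//   using gpuStream_t = hipStream_t;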
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
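// A hedged sketch (not part of this diff): a typical elementwise kernel
// written with the grid-stride macro. `Scale` is an illustrative __global__
// function, not an API defined here.
//
//   template <typename T>
//   __global__ void Scale(T* data, T factor, int64_t n) {
//     CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { data[i] *= factor; }
//   }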
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
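// For example, running with --selected_gpus=0,2 makes GetSelectedDevices()
// return {0, 2}; if the flag is empty, a machine with 4 GPUs yields
// {0, 1, 2, 3}.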
} // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_types.h"
namespace pten {
namespace backends {
namespace gpu {
//! Get the version of dnn
int DnnVersion();
//! Get the total number of GPU devices in system.
int GetGPUDeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetGPUComputeCapability(int id);
//! Get the runtime version of the ith GPU
int GetGPURuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetGPUDriverVersion(int id);
//! Whether the current device supports TensorCore
bool TensorCoreAvailable();
//! Get the MultiProcessors of the ith GPU.
int GetGPUMultiProcessors(int id);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetGPUMaxThreadsPerMultiProcessor(int id);
//! Get the MaxThreads of each block of the ith GPU.
int GetGPUMaxThreadsPerBlock(int id);
//! Get the current GPU device id in system.
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
//! Get the properties of the ith GPU device.
const gpuDeviceProp &GetDeviceProperties(int id);
//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind);
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count);
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
//! Blocks until stream has completed all operations.
void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
//! Blocks until the device has completed all operations.
void GpuDeviceSync();
gpuError_t GpuGetLastError();
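// A hedged sketch (not part of this diff) of the copy helpers above, assuming
// a CUDA build and that `dst`, `src`, `bytes` and `stream` were set up by the
// caller:
//
//   GpuMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
//   GpuStreamSync(stream);   // block until every operation on `stream` is done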
class GPUDeviceGuard {
public:
explicit inline GPUDeviceGuard(int dev_id) {
int prev_id = GetCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
SetDeviceId(dev_id);
}
}
inline ~GPUDeviceGuard() {
if (prev_id_ != -1) {
SetDeviceId(prev_id_);
}
}
GPUDeviceGuard(const GPUDeviceGuard &o) = delete;
GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
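// A minimal RAII sketch (not part of this diff): switch to another device for
// the scope of a block and restore the previous device on exit. `ptr`, `bytes`
// and `stream` are assumed to be set up by the caller.
//
//   {
//     GPUDeviceGuard guard(1);                  // switches to device 1 if needed
//     GpuMemsetAsync(ptr, 0, bytes, stream);    // work issued against device 1
//   }                                           // previous device id restored here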
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter config
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better when the thread_per_block
num is in [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace pten {
namespace backends {
namespace gpu {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for rounding an integer value up to the next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1)));
#endif
}
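// For example, RoundToPowerOfTwo(300) smears the bits of 299 to 511 and
// returns 512 on CUDA builds; on HIP the result is clamped to 256.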
#ifdef WITH_NV_JETSON
// The number of threads cannot be set to 1024 in some cases when the device
// is Jetson Nano or TX2.
inline void ChangeThreadNum(const pten::CUDAContext& context,
int* num_thread,
int alternative_num_thread = 512) {
if (context.GetComputeCapability() == 53 ||
context.GetComputeCapability() == 62) {
*num_thread = alternative_num_thread;
}
}
#endif
struct GpuLaunchConfig {
public:
GpuLaunchConfig() {}
size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
size_t GetGridSize() const {
return block_per_grid.x * block_per_grid.y * block_per_grid.z;
}
size_t GetBlockSize() const {
return thread_per_block.x * thread_per_block.y * thread_per_block.z;
}
int compute_capability = 0;
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
};
/* According to NVIDIA, CUDA performs better when the number of threads per
 * block is 64/128/256/512, and the number of blocks should be greater (at
 * least 2x~4x) than the number of SMs. Hence, the SM count is taken into
 * account within this function to determine the right number of threads per
 * block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context,
int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel,
0,
paddle::platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If thread number per block is 64/128/256/512, cuda performs better.*/
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
if (capability == 53 || capability == 62) {
limit_threads = 512;
}
#endif
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round the thread count up to a power of 2 while keeping the number of
// active blocks at about twice the SM count, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < limit_threads) {
// Round the thread count up to a power of 2 while keeping the number of
// active blocks at about 4 times the SM count, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
}
// Number of threads per block shall be at least 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
GpuLaunchConfig config;
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
return config;
}
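// A hedged usage sketch (not part of this diff): launch a 1-D elementwise
// kernel with the computed configuration. `FillKernel` is a hypothetical
// __global__ function and `ctx`, `out`, `n` are assumed to exist.
//
//   auto cfg = GetGpuLaunchConfig1D(ctx, n);
//   FillKernel<<<cfg.block_per_grid, cfg.thread_per_block, 0, ctx.stream()>>>(
//       out, n);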
inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context,
int x_dim,
int y_dim) {
PADDLE_ENFORCE_GT(x_dim,
0,
paddle::platform::errors::InvalidArgument(
"x dim number should greater than 0,"
" but received value is: %d",
x_dim));
PADDLE_ENFORCE_GT(y_dim,
0,
paddle::platform::errors::InvalidArgument(
"y dim number should greater than 0,"
" but received value is: %d",
y_dim));
const int kThreadsPerBlock = 256;
int block_cols = (std::min)(x_dim, kThreadsPerBlock);
int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
int max_physical_threads = context.GetMaxPhysicalThreadCount();
const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
GpuLaunchConfig config;
// Note: the block size is not aligned to 32; align it yourself if needed.
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
int grid_y =
(std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
config.block_per_grid = dim3(grid_x, grid_y, 1);
return config;
}
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#else // PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#endif
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t);
DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind);
DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t);
DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t);
DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t,
cudnnTensorFormat_t,
miopenTensorFormat_t);
DECLARE_TYPE_FOR_GPU(dnnActivationMode_t,
cudnnActivationMode_t,
miopenActivationMode_t);
#undef DECLARE_TYPE_FOR_GPU
#ifdef PADDLE_WITH_HIP
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = ROCM_CV;
#else // PADDLE_WITH_CUDA
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = CUDA_CV;
#endif
DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory,
cudaErrorMemoryAllocation,
hipErrorOutOfMemory);
DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady);
DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess);
#undef DECLARE_CONSTANT_FOR_GPU
} // namespace pten
#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why is this macro needed? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large (e.g. over 1G); the first iteration is no problem here,
* but when `i += blockDim.x * gridDim.x` is executed, the value of i
* will exceed INT_MAX and overflow to a negative value. At
* that point the loop condition `i < (n)` is still satisfied, so it
* will cause illegal access to cuda memory.
*
* Here is a real example from ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__ will
* prevent overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total number of elements
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
......@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
namespace paddle {
namespace platform {
namespace pten {
namespace backends {
namespace gpu {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
size_t version_major, version_minor, version_patch;
......@@ -78,11 +78,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code = hipDeviceGetAttribute(
&major, hipDeviceAttributeComputeCapabilityMajor, id);
......@@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version));
return driver_version;
......@@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) {
bool TensorCoreAvailable() { return false; }
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
......@@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
&count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
......@@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
......@@ -161,35 +173,37 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return hipGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
......@@ -13,8 +13,11 @@
// limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
......@@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api;
namespace pten {
struct XPUContext::XPUImpl {
void SetL3Cache() {
struct XPUContext::Impl {
void SetL3Cache(int l3_size = 14155776) {
const int MAX_XPU_NUM = 16;
static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
int l3_size = 13.5 * 1024 * 1024;
if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
}
......@@ -52,48 +54,28 @@ struct XPUContext::XPUImpl {
}
}
XPUImpl() {
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
}
explicit XPUImpl(XPUPlace place) : place_(place) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
Impl() : place_(XPUPlace()) {}
// Users need to manage external resources.
explicit XPUImpl(const XPUContextResource& ctx_res,
const XPUPlace& place = XPUPlace(0))
: res_(ctx_res), place_(place) {
context_ = res_.context;
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
explicit Impl(const Place& place) : place_(place) {}
~XPUImpl() {
if (res_.context == nullptr && context_ != nullptr) {
~Impl() {
if (owned_ && context_ != nullptr) {
xpu::destroy_context(context_);
context_ = nullptr;
}
}
Place GetPlace() const { return place_; }
backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
const Place& GetPlace() const { return place_; }
xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
return context_;
}
xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
xpu::BKCLContext_t GetBkclContext() const {
PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr.");
return bkcl_context_;
}
void Wait() const {
backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
......@@ -101,53 +83,41 @@ struct XPUContext::XPUImpl {
xpu_wait(context_->xpu_stream);
}
void SetXContext(xpu::Context* context) {
if (context == nullptr) {
return;
}
res_.context = context;
context_ = context;
void Init() {
owned_ = true;
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
XPUContextResource res_;
XPUPlace place_;
bool owned_{false};
Place place_;
backends::xpu::XPUVersion xpu_version_;
xpu::Context* context_{nullptr};
// NOTE: Distributed communicator, distributed framework manages its
// resources, XPUContext only holds references.
xpu::BKCLContext_t bkcl_context_{nullptr};
};
XPUContext::XPUContext() : DeviceContext() {
impl_ = std::make_unique<XPUImpl>();
}
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
XPUContext::XPUContext(const XPUPlace& place) {
impl_ = std::make_unique<XPUImpl>(place);
}
XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
impl_ = std::make_unique<XPUImpl>();
impl_->SetXContext(other.x_context());
impl_->SetBkclContext(other.bkcl_context());
}
XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
impl_ = std::move(other.impl_);
}
XPUContext::XPUContext(const XPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
XPUContext::~XPUContext() = default;
XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
impl_ = std::make_unique<XPUImpl>(ctx_res);
}
Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); }
backends::xpu::XPUVersion XPUContext::xpu_version() const {
return impl_->GetXpuVersion();
return impl_->xpu_version_;
}
xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
......@@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const {
void XPUContext::Wait() const { impl_->Wait(); }
void XPUContext::set_x_context(xpu::Context* context) {
void XPUContext::SetXContext(xpu::Context* context) {
impl_->SetXContext(context);
}
void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); }
void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context);
}
void XPUContext::Init() { impl_->Init(); }
} // namespace pten
......@@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api;
namespace pten {
struct XPUContextResource {
xpu::Context* context{nullptr};
};
class XPUContext : public DeviceContext {
public:
// NOTE: DeviceContext holds resources. Used in training scenarios.
XPUContext();
explicit XPUContext(const XPUPlace&);
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
XPUContext(const XPUContext&);
XPUContext(XPUContext&&);
virtual ~XPUContext();
Place GetPlace() const override;
const Place& GetPlace() const override;
backends::xpu::XPUVersion xpu_version() const;
......@@ -53,21 +42,28 @@ class XPUContext : public DeviceContext {
// Return bkcl context.
xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context);
// Wait for completion of all operations in the stream.
void Wait() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit XPUContext(const XPUContextResource&);
// NOTE: DeviceContext holds resources. Used in training scenarios.
// This interface is used in training: DeviceContext initializes all
// resources and releases them on destruction.
void Init();
void set_x_context(xpu::Context*);
public:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interfaces are for inference only: DeviceContext marks the
// resources as external and does not release them on destruction.
void SetXContext(xpu::Context*);
void set_bkcl_context(xpu::BKCLContext_t context);
void SetL3Cache(int l3_size = 14155776);
private:
struct XPUImpl;
std::unique_ptr<XPUImpl> impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
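To make the two lifecycles above concrete, here is a minimal usage sketch based only on the interface shown in this hunk; the xpu::Context created in the inference branch is assumed to be owned and released by the caller.
// Training: the context owns its resources.
pten::XPUContext train_ctx(pten::XPUPlace(0));
train_ctx.Init();                     // creates the xpu::Context and sets up the L3 cache
train_ctx.Wait();                     // synchronizes on the context's stream
// Inference: resources are created and released by the caller.
xpu::Context* external_ctx = xpu::create_context();  // assumed caller-owned
pten::XPUContext infer_ctx;
infer_ctx.SetXContext(external_ctx);  // marked external; not destroyed by XPUContext
infer_ctx.SetL3Cache();               // optional; defaults to 14155776 bytes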
......@@ -23,7 +23,7 @@ struct DeviceContext::Impl {
Impl() = default;
~Impl() = default;
void SetDeviceAllocator(const Allocator* allocator) {
void SetAllocator(const Allocator* allocator) {
PADDLE_ENFORCE_NOT_NULL(
allocator,
pten::errors::InvalidArgument(
......@@ -47,7 +47,7 @@ struct DeviceContext::Impl {
zero_allocator_ = allocator;
}
const Allocator& GetDeviceAllocator() const {
const Allocator& GetAllocator() const {
PADDLE_ENFORCE_NOT_NULL(
device_allocator_,
pten::errors::InvalidArgument("Required device_allocator_ shall not be "
......@@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); }
DeviceContext::DeviceContext(const DeviceContext& other) {
impl_->SetHostAllocator(&other.GetHostAllocator());
impl_->SetDeviceAllocator(&other.GetDeviceAllocator());
impl_->SetAllocator(&other.GetAllocator());
impl_->SetZeroAllocator(&other.GetZeroAllocator());
}
......@@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default;
void DeviceContext::SetDeviceAllocator(const Allocator* allocator) {
impl_->SetDeviceAllocator(allocator);
void DeviceContext::SetAllocator(const Allocator* allocator) {
impl_->SetAllocator(allocator);
}
const Allocator& DeviceContext::GetDeviceAllocator() const {
return impl_->GetDeviceAllocator();
const Allocator& DeviceContext::GetAllocator() const {
return impl_->GetAllocator();
}
void DeviceContext::SetHostAllocator(const Allocator* allocator) {
......
......@@ -60,7 +60,7 @@ class DeviceContext {
*
* @param allocator
*/
void SetDeviceAllocator(const Allocator*);
void SetAllocator(const Allocator*);
/**
* @brief Set the host Allocator object.
......@@ -81,7 +81,7 @@ class DeviceContext {
*
* @return Allocator
*/
const Allocator& GetDeviceAllocator() const;
const Allocator& GetAllocator() const;
/**
* @brief Get the const device-related Allocator object.
......@@ -114,7 +114,7 @@ class DeviceContext {
// TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later.
virtual Place GetPlace() const = 0;
virtual const Place& GetPlace() const = 0;
// TODO(wilber): The fluid framework uses wait() in many places; figure out how
// to remove this API interface.
virtual void Wait() const {}
......
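For reference, a condensed sketch of how a context is wired up under the renamed interface; it mirrors the CPU test updates later in this diff, and the allocator-facade calls are the ones used there.
pten::CPUContext dev_ctx;
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                         .GetAllocator(paddle::platform::CPUPlace())
                         .get());  // device allocator (formerly SetDeviceAllocator)
dev_ctx.Init();                    // initializes the remaining owned resources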
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256
......@@ -31,7 +32,7 @@ namespace funcs {
* 2x~4x) than the number of SMs. Hence, the SM count is taken into account within
* this function to determine the right number of threads per block.
*/
inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx,
inline int GetThreadsConfig(const pten::GPUContext &ctx,
int64_t numel,
int vec_size) {
int threads = ELEMENTWISE_BLOCK_SIZE;
......
......@@ -23,8 +23,8 @@ limitations under the License. */
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/backends/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = pten::kps;
......@@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
#else
auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
auto gpu_config =
pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
auto stream = ctx.stream();
......
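A brief sketch of the migrated launch-configuration path, using only the calls visible above; dev_ctx, numel, and vec_size stand in for the kernel's actual arguments.
auto gpu_config =
    pten::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
int main_offset = (numel / (vec_size * gpu_config.GetBlockSize())) * vec_size *
                  gpu_config.GetBlockSize();
auto stream = dev_ctx.stream();  // kernels are launched on the context's stream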
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/ddim.h"
......
......@@ -237,8 +237,7 @@ __global__ void SplitKernel(const T* input_data,
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
}
static inline void GetBlockDims(
const paddle::platform::CUDADeviceContext& context,
static inline void GetBlockDims(const pten::GPUContext& context,
int64_t num_rows,
int64_t num_cols,
dim3* block_dims,
......
......@@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
......@@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
......@@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx,
ctx_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
if (paddle::platform::is_same_place(src_place, dst_place)) {
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......
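Every branch above now obtains the stream by casting to pten::GPUContext rather than paddle::platform::CUDADeviceContext; the recurring pattern, condensed (dst_place and src_place stand in for the concrete places of each branch):
auto stream = blocking
                  ? nullptr
                  : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);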
......@@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
......@@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}
......@@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
......@@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::InverseFunctor<T>>(
dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
}
}
}
......
......@@ -1064,7 +1064,8 @@ template <typename Tx,
typename Ty,
template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const pten::DenseTensor& x,
void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx,
const pten::DenseTensor& x,
pten::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
......@@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
auto x_data = x.data<Tx>();
auto y_data = y->data<Ty>();
auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(x.place()));
if (config.reduce_num == 1) {
std::vector<const DenseTensor*> inputs = {&x};
std::vector<DenseTensor*> outputs = {y};
funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
*dev_ctx, inputs, &outputs, transform);
dev_ctx, inputs, &outputs, transform);
return;
}
......@@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx,
data_t,
ReduceOp,
TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
}));
} else {
using MPType = typename kps::details::MPTypeTrait<T>::Type;
pten::kernels::
TensorReduceFunctorImpl<T, T, ReduceOp, TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
}
}
} // namespace pten
......
......@@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad<GPUContext, T> {
DenseTensor* output,
const std::vector<int>& reduce_dims) {
auto stream = dev_ctx.stream();
kernels::
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
kernels::TensorReduceFunctorImpl<T,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
#endif
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/pten/api/include/api.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/copy_kernel.h"
......@@ -122,7 +123,7 @@ TEST(API, matmul_cuda) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto place = paddle::platform::CUDAPlace();
auto* dev_ctx = pool.GetByPlace(place);
auto* dev_ctx = static_cast<const pten::GPUContext*>(pool.GetByPlace(place));
pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
......
......@@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) {
std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);
pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test dense_to_sparse_coo
paddle::experimental::Tensor x(dense_x);
......
......@@ -25,43 +25,29 @@ limitations under the License. */
namespace pten {
namespace tests {
class InferenceCPUContext : public CPUContext {
public:
void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
CPUContext::SetEigenDevice(eigen_device);
}
};
TEST(DeviceContext, cpu_context) {
std::cout << "test training scenarios" << std::endl;
{
pten::CPUContext ctx;
ctx.Init();
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
std::cout << "test inference scenarios" << std::endl;
Eigen::DefaultDevice* device = new Eigen::DefaultDevice();
{
pten::CPUContextResource ctx_res{device};
pten::CPUContext ctx(ctx_res);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
{
pten::CPUContextResource ctx_res{nullptr};
pten::CPUContext ctx(ctx_res);
InferenceCPUContext ctx;
ctx.SetEigenDevice(device);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
delete device;
std::cout << "test copy constructor" << std::endl;
{
pten::CPUContext ctx1;
pten::CPUContext ctx2(ctx1);
EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device());
}
std::cout << "test move constructor" << std::endl;
{
pten::CPUContext ctx1 = pten::CPUContext();
auto* eigen_device1 = ctx1.eigen_device();
pten::CPUContext ctx2(std::move(ctx1));
auto* eigen_device2 = ctx2.eigen_device();
EXPECT_EQ(eigen_device1, eigen_device2);
}
}
} // namespace tests
......
......@@ -50,10 +50,10 @@ TEST(DEV_API, cast) {
}
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
pten::DataType out_dtype = pten::DataType::FLOAT64;
// 2. test API
......
......@@ -59,10 +59,10 @@ TEST(DEV_API, concat) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Concat<float>(dev_ctx, inputs, 0);
// 3. check result
......
......@@ -46,10 +46,10 @@ TEST(DEV_API, conj) {
}
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Conj<paddle::complex64>(dev_ctx, dense_x);
......
......@@ -58,10 +58,10 @@ TEST(DEV_API, copy) {
std::cout << typeid(a).name() << std::endl;
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
// 3. check result
......
......@@ -33,10 +33,10 @@ using DDim = pten::framework::DDim;
TEST(DEV_API, empty) {
// 1. create input
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Empty<float>(dev_ctx, {3, 2}, pten::DataType::INT32);
......@@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::EmptyLike<float>(dev_ctx, dense_x);
// 3. check result
......@@ -84,10 +84,10 @@ TEST(DEV_API, full) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Full<float>(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
// 3. check result
......@@ -118,10 +118,10 @@ TEST(DEV_API, full_like) {
float val = 1.0;
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::FullLike<float>(dev_ctx, dense_x, val);
......
......@@ -60,10 +60,10 @@ TEST(DEV_API, dot) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Dot<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -62,10 +62,10 @@ TEST(DEV_API, add) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -116,10 +116,10 @@ TEST(DEV_API, subtract) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -170,10 +170,10 @@ TEST(DEV_API, divide) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -224,10 +224,10 @@ TEST(DEV_API, multiply) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -56,10 +56,10 @@ TEST(DEV_API, flatten) {
}
int start_axis = 1, stop_axis = 2;
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
......
......@@ -55,10 +55,10 @@ TEST(DEV_API, dot) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
// 3. check result
......
......@@ -49,10 +49,10 @@ TEST(DEV_API, mean) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Mean<float>(dev_ctx, dense_x, dims, false);
// 3. check result
......
......@@ -48,10 +48,10 @@ TEST(DEV_API, reshape) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result
std::vector<int64_t> expect_shape = {12, 3};
......
......@@ -49,10 +49,11 @@ TEST(DEV_API, scale) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......@@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h"
......@@ -22,6 +24,8 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace pten {
namespace tests {
......@@ -38,9 +42,8 @@ inline void CheckResult(
ASSERT_EQ(coo.nnz(), non_zero_num);
#if defined(PADDLE_WITH_CUDA)
if (coo.place() == paddle::platform::CUDAPlace()) {
const auto* dev_ctx_cuda =
static_cast<const paddle::platform::CUDADeviceContext*>(dev_ctx);
if (coo.place() == pten::GPUPlace()) {
const auto* dev_ctx_cuda = static_cast<const pten::GPUContext*>(dev_ctx);
DenseTensor indices(
alloc.get(),
DenseTensorMeta(
......@@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
paddle::platform::CPUPlace());
pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test cpu
auto cpu_sparse_out =
sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
......@@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
// 2. test cuda
#if defined(PADDLE_WITH_CUDA)
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
// paddle::platform::DeviceContextPool& pool =
// paddle::platform::DeviceContextPool::Instance();
// auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace());
......@@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
cuda_alloc.get(),
DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x);
pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
auto sparse_out =
sparse::DenseToSparseCoo<T>(*dev_ctx_cuda, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(dev_ctx_cuda,
sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(&dev_ctx_gpu,
sparse_out,
non_zero_data,
indices_data,
......@@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
alloc);
// 2. test cuda
#if defined(PADDLE_WITH_CUDA)
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace());
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
// auto& pool = paddle::platform::DeviceContextPool::Instance();
// auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
pten::DenseTensor d_values(cuda_alloc.get(), values_meta);
pten::Copy(*dev_ctx_cuda, crows, true, &d_crows);
pten::Copy(*dev_ctx_cuda, cols, true, &d_cols);
pten::Copy(*dev_ctx_cuda, values, true, &d_values);
pten::Copy(dev_ctx_gpu, crows, true, &d_crows);
pten::Copy(dev_ctx_gpu, cols, true, &d_cols);
pten::Copy(dev_ctx_gpu, values, true, &d_values);
pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(*dev_ctx_cuda, d_csr);
CheckResult<T, int64_t>(dev_ctx_cuda,
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
CheckResult<T, int64_t>(&dev_ctx_gpu,
cuda_sparse_out,
non_zero_data,
indices_data,
......
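Both sparse tests above build the GPU context by hand with the same four steps; consolidated here as a reference sketch (the comments reflect the intent suggested by the method names, not a verified contract).
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();  // first stage, before allocators are available
dev_ctx_gpu.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
                             .get());       // device allocator bound to the context's stream
dev_ctx_gpu.SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                                 .GetAllocator(pten::CPUPlace())
                                 .get());   // host-side allocator
dev_ctx_gpu.PartialInitWithAllocator();     // second stage, allocator-dependent members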
......@@ -47,10 +47,11 @@ TEST(DEV_API, sum) {
std::vector<int64_t> axis = {0, 1};
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out =
pten::Sum<float>(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
......