Unverified commit a821c4a9, authored by Wilber, committed by GitHub

[PTEN] Add Gpu context (#39305)

Parent dcff7fa8
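Most of the hunks below follow one recurring pattern: a platform::CUDADeviceContext constructed directly (rather than obtained from the DeviceContextPool) must now be wired to its allocators before first use. The following is a minimal sketch of that pattern assembled from the hunks in this diff; the helper function name is illustrative and not part of the commit.

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Illustrative helper (name not from this commit): wire a manually constructed
// CUDADeviceContext to its device, host and zero-size allocators, then finish
// its deferred initialization.
void InitCudaDeviceContext(paddle::platform::CUDADeviceContext* ctx,
                           const paddle::platform::CUDAPlace& place) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
  ctx->SetZeroAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetZeroAllocator(place)
                            .get());
  ctx->PartialInitWithAllocator();
}

Many of the simpler call sites below only attach the device allocator before calling PartialInitWithAllocator(); a few (for example the TensorRT engine test, the malloc multi-stream tests, and NCCLCommContext) also set the host and zero-size allocators.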
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
......
......@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......
......@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
......
......@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
......
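The new specialization above declares that a platform::CUDADeviceContext converts to pten::GPUContext. Elsewhere in this commit the same relationship appears as explicit casts when fluid operators call pten kernels directly, for example in the reshape-op hunks further down (*in, pt_scalar_shape and out are the operator's existing arguments):

// Representative call-site pattern from this diff: the CUDA context is passed
// to a pten kernel as its pten::GPUContext base.
auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(static_cast<const pten::GPUContext&>(dev_ctx), *in,
                    pt_scalar_shape, out);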
......@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
}
} // namespace pten
......@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
......@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
......@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<int> dst;
......@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<bool> dst;
......@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
......
......@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
......
......@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->PartialInitWithAllocator();
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
......
......@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
dev_ctx.PartialInitWithAllocator();
auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
......
......@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}
......@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
......
......@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
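This call-site change, repeated across the CUDA kernels in this commit, reflects that GetCUDAMaxGridDimSize() no longer returns a dim3: the maximum grid extents are now read by index, presumably from an array-like type. A minimal sketch under that assumption; the helper is illustrative and not part of the commit.

#include "paddle/fluid/platform/device_context.h"

// Clamp a requested grid.x against the device limit using the new indexed
// accessor (max_grid_dim[0] rather than max_grid_dim.x).
inline int64_t ClampGridDimX(const paddle::platform::CUDADeviceContext& ctx,
                             int64_t wanted) {
  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
  return wanted < max_grid_dimx ? wanted : max_grid_dimx;
}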
......@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
......@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
......
......@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input_tensor, output_tensor, kps::IdentityFunctor<T>(),
reduce_dims_vec, stream);
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
......
......@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
......
......@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
reduce_dims, stream);
}
}
};
......
......@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
pt_z.get());
pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
*pt_x.get(), *pt_y.get(), axis, pt_z.get());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
......
......@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
framework::Tensor *src, framework::Tensor *dst) {
std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
......
......@@ -275,6 +275,18 @@ class TestFeedForward {
output_size_ = 3 * num_head_ * dim_head_;
input_size_ = dim_embed_;
ctx_ = new platform::CUDADeviceContext(place_);
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
ctx_->PartialInitWithAllocator();
size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim]
size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim]
......
......@@ -166,7 +166,8 @@ class AttnMatMul {
if (support_case_1 || support_case_2) {
gpuStream_t stream = dev_ctx_.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support reduce when the input dims are [0,1,2,3,4] and "
......
......@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluFwdCUDAKernel< \
......@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
is_aligned(x_g, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluBwdCUDAKernel< \
......
......@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
// 1. Insert data into keys and values.
......@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
ReindexSrcOutput<
......
......@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......
......@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
const int size_from_axis = SizeFromAxis(axis, X.dims());
const int size_out_axis = SizeOutAxis(axis, X.dims());
constexpr int thread_size = 512;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
int64_t height = size_to_axis * size_out_axis;
int block_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
......
......@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
int block_y = std::min(GetLastPow2(height), max_threads / block_x);
int block_z = std::min(num_img, max_threads / block_x / block_y);
dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
auto max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
int grid_z =
std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
const int capability = context.GetComputeCapability();
platform::GpuLaunchConfig config;
......
......@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
}
if (dy) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
}
#else
auto* place = dev_ctx.eigen_device();
......
......@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
......
......@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
......@@ -129,6 +131,83 @@ void TestBeamSearch() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
* case 1:
......@@ -441,6 +443,31 @@ void TestConcatMain() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const int rows, const int cols) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
T>(*context);
......@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const std::string& mode) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
// get sparse
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testIm2col() {
......@@ -60,6 +62,7 @@ void testIm2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
......@@ -164,6 +167,165 @@ void testIm2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
......@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
......
......@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
......
......@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testVol2col() {
......@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
/**
* input = [[0, 1, 2,
* 3, 4, 5]
......@@ -123,6 +124,124 @@ void testVol2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream);
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
}
};
......
......@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
}
NCCLInitOp();
......
......@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
......
......@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim,
stream);
dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
reduce_dim, stream);
#else // for cpu
paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
......@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
}
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
......
......@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place);
// create var on local scope
......
......@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream);
static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
}
} // namespace operators
......
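The wrapper above gains a leading device-context parameter and forwards it to the pten implementation; every CUDA call site in this commit is updated to match. A representative call with the new signature (the tensors, functor and reduce dims stand for whatever the surrounding kernel already has in scope):

// Illustrative call site: the CUDA device context now comes first.
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
    dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
    dev_ctx.stream());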
......@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
......@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis,
stream);
ctx.cuda_device_context(), mul_value, &weight_derivative,
kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()),
......
......@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
}
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize()
.x;
.GetCUDAMaxGridDimSize()[0];
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<<
......
......@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx)
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
......@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
......@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
......@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
......@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables.
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
......@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......
......@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
......
......@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else {
math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
......@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
}
const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
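Both GPU-kernel hunks above follow from the same interface change: GetCUDAMaxGridDimSize() now returns std::array<int, 3> instead of dim3, so callers index [0] rather than reading .x. A small sketch of a caller after the change; the helper below is illustrative and not part of the patch:

#include <algorithm>
#include <array>
#include <cstdint>

// Clamp a requested 1-D grid size to the device limit along x, using the new
// array-based return type (works with any context exposing the method).
template <typename Context>
int64_t ClampGridDimX(const Context& dev_ctx, int64_t requested) {
  const std::array<int, 3> max_grid = dev_ctx.GetCUDAMaxGridDimSize();
  return std::min<int64_t>(requested, static_cast<int64_t>(max_grid[0]));
}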
......@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif()
if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool)
endif()
......
......@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
......@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id));
......@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm);
dev_ctx->SetBkclContext(comm);
}
return comm_map_[ring_id][dev_id].get();
......
IF(WITH_GPU)
add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM)
add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda)
hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
......@@ -14,8 +14,10 @@
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(const std::function<void(blasHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
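CublasHandleHolder::Call (and the CublasCall wrappers further below) change from a forwarding template to a const std::function<void(blasHandle_t)>& parameter. Existing call sites that pass lambdas keep compiling, since lambdas convert to std::function. A hedged sketch of such a caller on a CUDA build; it assumes dynload::cublasSscal, which Paddle's BLAS wrappers already use, and is not part of the patch:

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"

// Illustrative caller: scale a device array in place under the handle lock.
void ScaleInPlace(const paddle::platform::CUDADeviceContext& dev_ctx,
                  float alpha, float* x, int n) {
  dev_ctx.CublasCall([&](cublasHandle_t handle) {
    // The lambda converts to std::function<void(blasHandle_t)>.
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cublasSscal(handle, n, &alpha, x, 1));
  });
}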
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle {
namespace platform {
......@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(
const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
......@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
......@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
......@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
......@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int);
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
......@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations.
void GpuDeviceync();
void GpuDeviceSync();
//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
......@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_;
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
explicit NCCLContext(int dev_id) : comm_{nullptr} {
ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
......
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
......@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
......@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->context()->RawStream())
.get());
// Note: A trick to initialize the context; why does the GetAllocator
// interface need a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif
} else {
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
}
dev_ctx->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
......@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
}
}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; }
const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
......@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
......@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
......@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
......@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetGPUComputeCapability(place_.device);
multi_process_ = GetGPUMultiProcessors(place_.device);
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatibility
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: pten::GPUContext(place) {
pten::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(
new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}
int CUDADeviceContext::GetSMCount() const { return multi_process_; }
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
CUDADeviceContext::~CUDADeviceContext() = default;
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const {
return context()->CublasTensorCoreHandle() != nullptr;
if (thread_ctx_.count(this)) {
return context()->EigenDevice().get();
}
return pten::GPUContext::eigen_device();
}
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
void CUDADeviceContext::Wait() const {
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
}
#ifdef PADDLE_WITH_HIP
......@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle();
if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
}
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
return context()->CusparseHandle()->GetCusparseHandle();
if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
}
#endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
gpuStream_t CUDADeviceContext::stream() const {
if (thread_ctx_.count(this)) {
return context()->RawStream();
}
return pten::GPUContext::stream();
}
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
......@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h"
......@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
......@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
......@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
......@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
......@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......@@ -326,20 +331,20 @@ class CUDAContext {
#endif
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tf32_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback));
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(callback);
}
#endif
......@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
......@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
class CUDADeviceContext : public DeviceContext {
class CUDADeviceContext : public pten::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
......@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback);
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
return context()->CusparseCall(callback);
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
}
#endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback);
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
......@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const;
#endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
......@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void WaitStreamCallback() const {
return context()->Stream()->WaitCallback();
}
void AddStreamCallback(const std::function<void()>& callback) const;
void ResetDefaultContext(const stream::Priority& priority) {
default_ctx_.reset(new CUDAContext(place_, priority));
}
void WaitStreamCallback() const;
void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority));
thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
}
std::shared_ptr<CUDAContext> context() const {
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
std::shared_ptr<CUDAContext> context() const;
// Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream);
}
private:
CUDAPlace place_;
std::shared_ptr<CUDAContext> default_ctx_;
// NOTE: Kept only for compatibility with the past; please delete it once
// there is a more elegant way.
stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*,
......@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provide fast collectives over multiple GPUs
// both within and across nodes.
// However, this communicator is only used for collectives over the
// multiple GPUs within a single node.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
// NOTE: Kept only for compatibility with the past; please delete it once
// there is a more elegant way.
std::unique_ptr<stream::CUDAStream> cuda_stream_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
......@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
......@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
delete device_context;
......@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
......
......@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream());
wrapper->inner_event_.Record(cuda_dev_ctx->stream());
}
bool DeviceEventQueryCUDA(const DeviceEvent* event) {
......@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent(
wrapper->inner_event_.GetRawCudaEvent());
cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
}
void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU;
......@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator
DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder
event.Record(context);
bool status = event.Query();
status = event.Query();
ASSERT_EQ(status, false);
// case 3. test for event_finisher
event.Finish();
......
......@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......
......@@ -152,11 +152,11 @@ class CudaEvent {
#endif
}
void Record(const paddle::platform::stream::CUDAStream &stream) {
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
......
......@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle {
namespace platform {
......@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
}
}
// NOTE: Delete this ForRange<CUDADeviceContext> specialization once the
// corresponding pten kernels are migrated.
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
......@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_;
};
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace platform
......
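The added ForRange<pten::GPUContext> specialization mirrors the existing CUDADeviceContext one, so elementwise functors can be launched the same way from pten kernels. A short usage sketch; the functor and wrapper are illustrative, not part of the patch, assume the surrounding header is paddle/fluid/platform/for_range.h, and must be compiled as CUDA (a .cu file):

#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

// Illustrative elementwise functor: out[i] = 2 * in[i].
template <typename T>
struct DoubleFunctor {
  const T* in;
  T* out;
  HOSTDEVICE void operator()(size_t i) const { out[i] = in[i] + in[i]; }
};

template <typename T>
void DoubleOnGpu(const pten::GPUContext& dev_ctx, const T* in, T* out,
                 size_t limit) {
  paddle::platform::ForRange<pten::GPUContext> for_range(dev_ctx, limit);
  for_range(DoubleFunctor<T>{in, out});  // launches the elementwise kernel
}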
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->Stream()
.get();
return stream;
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->GetCudaStream();
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
......@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->SetStream(stream);
->SetCudaStream(stream);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
return nullptr;
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
return nullptr;
#endif
}
} // namespace stream
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag);
}
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback>
void AddCallback(Callback&& callback) const {
void AddCallback(std::function<void()> callback) const {
callback_manager_->AddCallback(callback);
}
template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
}
......@@ -149,6 +154,7 @@ class CUDAStream final {
};
CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with this interface and it needs to be fixed
CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream
......
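The header above adds a non-owning CUDAStream constructor (owned_stream_ is left false), which is what the rewritten CUDADeviceContext constructor uses to wrap the stream created by pten::GPUContext. A minimal sketch of wrapping an externally owned raw stream; the helper is illustrative and not part of the patch:

#include <memory>

#include "paddle/fluid/platform/stream/cuda_stream.h"

// Wrap a raw stream without taking ownership: destroying the wrapper does
// not destroy `raw`, because this constructor does not set owned_stream_.
std::unique_ptr<paddle::platform::stream::CUDAStream> WrapExistingStream(
    gpuStream_t raw, const paddle::platform::Place& place) {
  return std::make_unique<paddle::platform::stream::CUDAStream>(raw, place);
}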
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h"
......@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
......@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
......@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current "
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
#endif
},
......@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event;
event.Record(stream);
event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent());
},
......@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) {
event = new paddle::platform::CudaEvent();
}
event->Record(self);
event->Record(self.raw_stream());
return event;
},
R"DOC(
......@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1);
}
self.Record(*stream);
self.Record(stream->raw_stream());
},
R"DOC(
Records the event in the given stream.
......
......@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
......@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif
})
.def_static("create",
......@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#else
return new paddle::platform::CUDADeviceContext(place);
auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif
})
.def_static("create",
......
......@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU)
add_subdirectory(xpu)
endif()
......@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
......@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add a WITH_EIGEN option to support
// compiling without Eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {}
// Users need to manage external resources.
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
explicit Impl(const Place& place) : place_(place) {}
~CPUImpl() {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
~Impl() {
if (owned_) {
delete eigen_device_;
}
}
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr.");
return device_;
void Init() {
owned_ = true;
eigen_device_ = new Eigen::DefaultDevice();
}
void SetEigenDevice(Eigen::DefaultDevice* device) {
if (device == nullptr) {
return;
}
res_.device = device;
device_ = device;
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return eigen_device_;
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Place place_;
};
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
void CPUContext::Init() { impl_->Init(); }
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice();
return impl_->GetEigenDevice();
}
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device);
impl_->eigen_device_ = device;
}
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten
......@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext {
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext();
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
explicit CPUContext(const Place&);
virtual ~CPUContext();
Eigen::DefaultDevice* eigen_device() const;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
const Place& GetPlace() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit CPUContext(const CPUContextResource& ctx_res);
// NOTE: DeviceContext holds resources. Used in training scenarios.
// This is the interface used in training: DeviceContext initializes
// all resources and deletes them on destruction.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only; DeviceContext marks the
// resource as external and does not delete it on destruction.
void SetEigenDevice(Eigen::DefaultDevice* device);
private:
struct CPUImpl;
std::unique_ptr<CPUImpl> cpu_impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
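The reworked CPUContext separates two lifecycles: Init() for the training path, where the context allocates and owns its Eigen::DefaultDevice (this is what the fluid CPUDeviceContext constructors above call), and the protected SetEigenDevice() for the inference path, where a subclass injects an externally owned device. A minimal sketch of the owning path; the helper is illustrative and not part of the patch:

#include <memory>

#include "paddle/pten/backends/cpu/cpu_context.h"

// Owning (training-style) usage: the Eigen device exists only after Init().
std::unique_ptr<pten::CPUContext> MakeOwnedCpuContext() {
  auto ctx = std::make_unique<pten::CPUContext>();
  ctx->Init();                // allocates the Eigen::DefaultDevice it owns
  (void)ctx->eigen_device();  // safe to use once Init() has run
  return ctx;
}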
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, e.g. over 1G. The first iteration is fine, but once
* `i += blockDim.x * gridDim.x` is executed, the value of i can exceed
* INT_MAX and overflow to a negative value. At that point the loop
* condition `i < (n)` is still satisfied, so the kernel makes an illegal
* access to CUDA memory.
*
* Here is a real example in ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__
* prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
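A short usage sketch of the macro defined above; the kernel is illustrative, not part of the patch, and assumes this header is on the include path:

// Scale `num` elements in place. The 64-bit __index__ computed by the macro
// keeps the loop safe even when blockIdx.x * blockDim.x exceeds INT_MAX.
template <typename T>
__global__ void ScaleKernel(T* data, T factor, int64_t num) {
  CUDA_KERNEL_LOOP_TYPE(i, num, int64_t) { data[i] *= factor; }
}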
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion();
......@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
......@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
......@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
}
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
......@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
......@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
......@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
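The migration above also changes GetGpuMaxGridDimSize to return std::array<int, 3> instead of dim3. A minimal caller-side sketch under that assumption (the helper name below is hypothetical; the header path follows the layout introduced in this patch):
#include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// Hypothetical helper, for illustration only: clamp a requested grid.x against
// the device limit, which now comes back as a plain int instead of dim3::x.
static int ClampGridDimX(int device_id, int requested) {
  std::array<int, 3> max_grid =
      pten::backends::gpu::GetGpuMaxGridDimSize(device_id);
  return requested < max_grid[0] ? requested : max_grid[0];
}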
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
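These forward declarations let platform-agnostic wrapper headers mention the opaque handle types without pulling in the vendor headers. A small sketch assuming only forwards.h is included (the struct name is hypothetical):
#include "paddle/pten/backends/gpu/forwards.h"
// Illustration only: the members are opaque pointers, so this declaration
// compiles without including cudnn.h or cublas_v2.h.
struct GpuHandleHolder {
  cudnnHandle_t dnn = nullptr;
  cublasHandle_t blas = nullptr;
};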
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include <array>
#include <functional>
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
} // namespace pten
#endif
class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle with which multiple cudnn
* functions can be called without interruption from other threads.
* Once the first cudnn function is called through the handle, a lock
* is acquired to prevent other threads from accessing the workspace;
* the lock is released when the handle is destructed.
*/
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext holds the resources. Used in training scenarios:
// through this interface DeviceContext initializes all resources and
// deletes them when it is destructed.
// Note that you must set the Allocator before calling the Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter?
// The trick methods below temporarily bypass this problem and should be
// removed later.
// Trick implementation: partially initializes the context when the
// SetAllocator interface has not been called.
void PartialInitWithoutAllocator();
// Trick implementation: initializes the resources that require an Allocator
// after the SetAllocator interface has been called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage the resources. Used in inference scenarios:
// the Set interfaces are for inference only; DeviceContext marks such
// resources as external and does not delete them when it is destructed.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
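The notes above describe two usage modes for GPUContext: a training path where the context owns its resources (set the Allocator, then Init), and an inference path where external code injects resources through the protected Set* interfaces. A minimal sketch of the training-style path, assuming SetAllocator is inherited from the DeviceContext base class and pten::Allocator is the expected allocator type:
#include "paddle/pten/backends/gpu/gpu_context.h"
// Sketch only: `alloc` stands for whatever allocator implementation the caller
// already owns; per the note above it must be set before Init().
void InitTrainingContext(const pten::GPUPlace& place,
                         const pten::Allocator* alloc /* assumed type */) {
  pten::GPUContext ctx(place);
  ctx.SetAllocator(alloc);  // assumed to come from the pten::DeviceContext base
  ctx.Init();               // the context now owns its stream and library handles
}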
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
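With these aliases, code layered above the backend can be written once against the gpu*/dnn*/blas* names and resolve to the CUDA or ROCM types at compile time. A hypothetical declaration-only sketch:
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace pten {
// Hypothetical interface, for illustration: under CUDA this resolves to
// (cudaStream_t, cudnnHandle_t, cublasHandle_t), under HIP to
// (hipStream_t, miopenHandle_t, rocblas_handle).
void BindExternalHandles(gpuStream_t stream, dnnHandle_t dnn, blasHandle_t blas);
}  // namespace pten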
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
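CUDA_KERNEL_LOOP is the usual grid-stride loop helper with the loop index typed as int. A hypothetical kernel sketch of its intended use:
#include "paddle/pten/backends/gpu/gpu_helper.h"
// Illustration only: scales `num` elements; the macro strides the index across
// the whole grid, so any launch configuration covers the full range.
template <typename T>
__global__ void ScaleKernel(T* data, T factor, int num) {
  CUDA_KERNEL_LOOP(i, num) { data[i] *= factor; }
}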
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
} // namespace gpu
} // namespace backends
} // namespace pten
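GetSelectedDevices either parses the selected_gpus flag or falls back to every visible device. A short usage sketch with a hypothetical helper:
#include <vector>
#include "paddle/pten/backends/gpu/gpu_info.h"
// Hypothetical helper, for illustration: with --selected_gpus=0,2 this returns 2;
// with the flag left empty it counts every device reported by the runtime.
int CountSelectedDevices() {
  return static_cast<int>(pten::backends::gpu::GetSelectedDevices().size());
}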