From 934171ae01581b1ec2928e25c6465859b651b244 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 1 Sep 2022 10:05:26 +0800 Subject: [PATCH] remove circular dependency of device_context and allocator (#45455) * refine cmake of framework * add deps for dense tensor * fix deps * remove alloc(ctx) * add depends on mkldnn --- paddle/fluid/framework/CMakeLists.txt | 13 +++- .../framework/details/nan_inf_utils_detail.cu | 6 +- paddle/fluid/framework/operator.h | 24 ++----- paddle/fluid/framework/var_type_traits.cc | 1 - paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- paddle/fluid/memory/malloc.h | 2 - paddle/fluid/memory/malloc_test.cu | 10 ++- .../operators/deformable_psroi_pooling_op.cu | 10 ++- .../fluid/operators/detection/bbox_util.cu.h | 5 +- paddle/fluid/operators/dgc_op.h | 14 ++++- paddle/fluid/operators/fake_quantize_op.h | 11 ++-- paddle/fluid/operators/layer_norm_kernel.cu.h | 12 ++-- paddle/fluid/operators/mean_iou_op.cu | 5 +- paddle/fluid/operators/partial_concat_op.cu | 10 ++- paddle/fluid/operators/partial_sum_op.cu | 11 +++- paddle/fluid/operators/prroi_pool_op.cu | 10 ++- paddle/fluid/operators/quantize_linear_op.h | 5 +- paddle/fluid/operators/sum_op.cu | 11 +++- paddle/fluid/platform/device_context.cc | 62 ------------------- paddle/phi/backends/CMakeLists.txt | 2 +- paddle/phi/core/CMakeLists.txt | 5 +- .../kernels/funcs/concat_and_split_functor.cu | 21 +++++-- .../phi/kernels/funcs/elementwise_grad_base.h | 35 ++++++++--- paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 15 ++++- paddle/phi/kernels/funcs/matrix_solve.cu | 10 ++- .../funcs/sparse/sparse_blas_impl.cu.h | 18 ++++-- .../kernels/funcs/values_vectors_functor.h | 10 ++- paddle/phi/kernels/gpu/add_n_kernel.cu | 6 +- paddle/phi/kernels/gpu/amp_kernel.cu | 22 +++++-- paddle/phi/kernels/gpu/box_coder.cu | 5 +- paddle/phi/kernels/gpu/cholesky_kernel.cu | 10 ++- paddle/phi/kernels/gpu/flip_kernel.cu | 15 ++++- .../gpu/generate_proposals_v2_kernel.cu | 6 +- paddle/phi/kernels/gpu/lu_kernel.cu | 5 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 25 ++++++-- paddle/phi/kernels/gpu/nms_kernel.cu | 4 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 5 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/roi_align_kernel.cu | 5 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/roi_pool_kernel.cu | 5 +- paddle/phi/kernels/gpu/stack_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/stack_kernel.cu | 5 +- paddle/phi/kernels/gpu/svd_kernel.cu | 10 ++- .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 5 +- .../phi/kernels/gpu/sync_batch_norm_utils.h | 5 +- .../kernels/gpu/triangular_solve_kernel.cu | 5 +- paddle/phi/kernels/gpu/yolo_box_kernel.cu | 6 +- 48 files changed, 321 insertions(+), 185 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7a9c631941e..369dc395fae 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -73,7 +73,8 @@ cc_library( cc_library( data_type SRCS data_type.cc - DEPS framework_proto ddim device_context) + DEPS framework_proto) + cc_test( data_type_test SRCS data_type_test.cc @@ -183,7 +184,7 @@ cc_test( cc_library( var_type_traits SRCS var_type_traits.cc - DEPS selected_rows_utils framework_proto scope) + DEPS framework_proto scope) if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() @@ -364,7 +365,13 @@ cc_library( cc_library( shape_inference SRCS shape_inference.cc - DEPS ddim attribute device_context) + DEPS ddim attribute) + +# 
every source file that includes "dnnl.h" must depends on mkldnn +# or, the first one should depends on mkldnn +if(WITH_MKLDNN) + add_dependencies(shape_inference mkldnn) +endif() cc_test( no_need_buffer_vars_inference_test diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index d91225a8141..4aa24f8cb6a 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -161,8 +161,10 @@ void TensorCheckerVisitor::apply( std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert - auto gpu_str_tensor = - paddle::memory::Alloc(*dev_ctx, op_var.length() + 1); + auto gpu_str_tensor = paddle::memory::Alloc( + dev_ctx->GetPlace(), + op_var.length() + 1, + phi::Stream(reinterpret_cast(dev_ctx->stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 27ce31c25c0..43a5b7a0bb9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -445,26 +445,10 @@ class ExecutionContext { template Tensor AllocateTmpTensor(const framework::DDim& dim, const DevContext& dev_ctx) const { - auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); - auto& deleter = tmp_allocation_ptr.get_deleter(); - auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE_GE( - allocation_ptr->size(), - phi::product(dim) * sizeof(T), - platform::errors::PreconditionNotMet( - "The data memory size(%d) is less than the tensor needed memory " - "size(%d).", - allocation_ptr->size(), - phi::product(dim) * sizeof(T))); - - paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType( - framework::ToDataType(std::type_index(typeid(T))))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; + phi::DenseTensor tmp; + tmp.Resize(dim); + dev_ctx.template Alloc(&tmp); + return tmp; } const RuntimeContext Context() const { return ctx_; } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index a3231b1b8b9..6331ee8861a 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/macros.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 13a405c7d3d..515524f68bd 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(ALLOCATOR_DEPS place stats profiler phi_backends) +set(ALLOCATOR_DEPS place stats profiler phi_backends device_context) set(ALLOCATOR_SRCS allocator.cc cpu_allocator.cc diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 49ced76c337..b8f5f0289c4 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -33,8 +33,6 @@ extern std::shared_ptr AllocShared(const 
platform::Place& place, extern AllocationPtr Alloc(const platform::Place& place, size_t size); -extern AllocationPtr Alloc(const phi::DeviceContext& dev_ctx, size_t size); - extern uint64_t Release(const platform::Place& place); extern std::shared_ptr AllocShared(const platform::Place& place, diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index b3308ffdd30..0bf5e99b773 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -67,7 +67,10 @@ void MultiStreamCompute(float **data, float **second_data, const phi::GPUContext &ctx) { // multi-streams - AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); + AllocationPtr allocation_ptr = + Alloc(ctx.GetPlace(), + N * sizeof(float), + phi::Stream(reinterpret_cast(ctx.stream()))); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *data = reinterpret_cast(allocation_ptr->ptr()); #ifdef PADDLE_WITH_HIP @@ -77,7 +80,10 @@ void MultiStreamCompute(float **data, #endif // allocate and compute on same stream again - allocation_ptr = Alloc(ctx, N * sizeof(float)); + allocation_ptr = + Alloc(ctx.GetPlace(), + N * sizeof(float), + phi::Stream(reinterpret_cast(ctx.stream()))); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *second_data = reinterpret_cast(allocation_ptr->ptr()); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index d974a60197d..2fcdebd5e82 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -266,7 +266,10 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, @@ -577,7 +580,10 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 90be767e2f2..e4accef0fa9 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -301,7 +301,10 @@ static void NMS(const phi::GPUContext &ctx, const T *boxes = proposals.data(); auto place = ctx.GetPlace(); - auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + auto mask_ptr = + memory::Alloc(ctx.GetPlace(), + boxes_num * col_blocks * sizeof(uint64_t), + phi::Stream(reinterpret_cast(ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 80e8ea17c11..82e002cbb33 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -187,7 +187,19 @@ class DGCOpKernel : public framework::OpKernel { ctx.GetPlace()); int buf_size = paddle::communication::dgc::get_buffer_size(k); - auto tmp_ious_data = 
memory::Alloc(dev_ctx, buf_size); + paddle::memory::allocation::AllocationPtr tmp_ious_data; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + tmp_ious_data = memory::Alloc( + dev_ctx.GetPlace(), + buf_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + } +#endif + if (platform::is_cpu_place(dev_ctx.GetPlace())) { + tmp_ious_data = memory::Alloc(dev_ctx.GetPlace(), buf_size); + } + void* buf = reinterpret_cast(tmp_ious_data->ptr()); if (!paddle::communication::dgc::k_select( diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 9d0527d7104..92aaa1fb248 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -328,8 +328,10 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { // training auto *in_accum = context.Input("InAccum"); auto *in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T *cur_scale_data = static_cast(cur_scale->ptr()); + + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); @@ -417,8 +419,9 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { // training auto *in_accum = context.Input("InAccum"); auto *in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T *cur_scale_data = static_cast(cur_scale->ptr()); + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 8ed706a5443..0c41429c61e 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -1815,10 +1815,14 @@ static void LayerNormBackward( constexpr int part_size = BDIMY2 * VPT; const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1); - auto part_grad_gamma_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); - auto part_grad_beta_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); + auto part_grad_gamma_ptr = memory::Alloc( + dev_ctx.GetPlace(), + part_size * feature_size * sizeof(U), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto part_grad_beta_ptr = memory::Alloc( + dev_ctx.GetPlace(), + part_size * feature_size * sizeof(U), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); U *part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 08ab074718b..ee31607c63a 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -116,7 +116,10 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel { auto out_correct_t = EigenTensor::From(*out_correct); // Temporary memory - auto tmp_ious_data = memory::Alloc(dev_ctx, num_classes * sizeof(float)); + auto tmp_ious_data = memory::Alloc( + dev_ctx.GetPlace(), + num_classes * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* ious_data = static_cast(tmp_ious_data->ptr()); // Init out_wrong, out_correct and out_mean_iou diff --git 
a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index f4d8f7083b0..a6b2700a1a4 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -126,7 +126,10 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { for (int i = 0; i < in_num; ++i) in_data.emplace_back(in_vars[i]->data()); - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), @@ -202,7 +205,10 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { for (size_t i = 0; i < in_num; ++i) { out_data.emplace_back(outs[i]->data()); } - auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); + auto tmp_out_array = memory::Alloc( + dev_ctx.GetPlace(), + out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 69517233bf3..e0703532c12 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -122,7 +122,10 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } if (!in_data.empty()) { - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), @@ -204,8 +207,10 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } if (!out_data.empty()) { - auto tmp_out_array = - memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); + auto tmp_out_array = memory::Alloc( + dev_ctx.GetPlace(), + out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index ac4666bb174..e95201c472a 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -287,7 +287,10 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.cuda_device_context(); int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, @@ -377,7 +380,10 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.cuda_device_context(); int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index 315ca108273..fd0579023b3 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ 
b/paddle/fluid/operators/quantize_linear_op.h @@ -60,8 +60,9 @@ class QuantizeLinearKernel : public framework::OpKernel { // training auto* in_accum = context.Input("InAccum"); auto* in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T* cur_scale_data = static_cast(cur_scale->ptr()); + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T* cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 2cc17de1820..35a1680d84d 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -200,8 +200,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } } if (!sr_in_out_data.empty()) { - auto tmp_sr_in_out_array = - memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *)); + auto tmp_sr_in_out_array = memory::Alloc( + dev_ctx.GetPlace(), + sr_in_out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), @@ -221,7 +223,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 98c2e92f2c3..e2fec11c190 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -39,68 +39,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -namespace paddle { -namespace memory { - -AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { - auto place = dev_ctx.GetPlace(); - if (size == 0) { - return Alloc(place, size); - } - - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* default_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = static_cast(dev_ctx); - if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), - size, - phi::Stream(reinterpret_cast( - desired_dev_ctx.stream()))); - } else { - return allocation::GPUContextAllocatorPool::Instance().Alloc( - desired_dev_ctx, size); - } -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use CUDA device since it's not compiled with CUDA," - "Please recompile or reinstall Paddle with GPU support.")); -#endif - } else if (platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - // TODO(liuyuhui): Consider xpu stream later - return Alloc(place, size); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else if (platform::is_mlu_place(place)) { -#ifdef PADDLE_WITH_MLU - auto* default_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = - static_cast(dev_ctx); - if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return Alloc(place, size); - } else { - return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc( - desired_dev_ctx, size); - } -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use MLU device since it's not compiled with MLU," - "Please recompile or reinstall Paddle with MLU support.")); -#endif - } else { - return Alloc(place, size); - } -} - -} // namespace memory -} // namespace paddle - namespace paddle { namespace platform { diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 6a55c34266f..9a26aed5f34 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(dynload) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc) -set(BACKENDS_DEPS enforce place flags eigen3) +set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context) if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 099cc87834c..e48f73694af 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -40,7 +40,10 @@ cc_library( cc_library( dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc - DEPS convert_utils fluid_convert_utils tensor_meta tensor_base) + DEPS convert_utils fluid_convert_utils tensor_meta tensor_base ddim) + +target_link_libraries(dense_tensor malloc) + cc_library( sparse_coo_tensor SRCS sparse_coo_tensor.cc diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 01701ee2873..57cf64d8df1 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -315,7 +315,10 @@ struct ConcatFunctor { paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; const T** dev_ins_data = nullptr; 
if (!has_same_shape || in_num < 2 || in_num > 4) { - tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); + tmp_dev_ins_data = paddle::memory::Alloc( + context.GetPlace(), + in_num * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( inputs_data, in_num); paddle::memory::Copy(context.GetPlace(), @@ -360,8 +363,10 @@ struct ConcatFunctor { dev_ins_data, in_num, in_col, out_row, out_col, output->data()); } } else { - auto tmp_dev_ins_col_data = - paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + auto tmp_dev_ins_col_data = paddle::memory::Alloc( + context.GetPlace(), + inputs_col_num * sizeof(int64_t), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( inputs_col, inputs_col_num); @@ -475,7 +480,10 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { // TODO(chentianyu03): try to find a method to remove the Alloc function - tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); + tmp_dev_outs_data = paddle::memory::Alloc( + context.GetPlace(), + o_num * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( outputs_data, o_num); paddle::memory::Copy(context.GetPlace(), @@ -523,7 +531,10 @@ class SplitFunctor { auto tmp_dev_ins_col_data = // TODO(chentianyu03): try to find a method to remove the Alloc // function - paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + paddle::memory::Alloc( + context.GetPlace(), + outputs_cols_num * sizeof(int64_t), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( outputs_cols, outputs_cols_num); paddle::memory::Copy(context.GetPlace(), diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 9ca21b967a4..62889b530af 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -1524,7 +1524,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ComputeBroadcastKernelSize( y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); - auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_strides_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_strides_array_gpu = reinterpret_cast(x_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1534,7 +1537,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_strides_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_strides_array_gpu = reinterpret_cast(y_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1544,7 +1550,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto out_dims_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); paddle::memory::Copy( gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); @@ -1554,7 +1563,10 @@ void CommonGradBroadcastCUDA(const DenseTensor 
&x, int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); if (dx) { - auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_strides_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_strides_order_gpu = reinterpret_cast(x_strides_order_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1564,7 +1576,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_dims_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, x_dims_order_gpu, @@ -1589,7 +1604,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, dx_op); } if (dy) { - auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_strides_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_strides_order_gpu = reinterpret_cast(y_strides_order_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1599,7 +1617,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_dims_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, y_dims_order_gpu, diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc index eb9434396cc..c43c3c04755 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -36,7 +36,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, if (n >= 32) { // Copy all elements of input matrix A to a temporary memory space to // avoid being overriden by getrf. - tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T)); + tmp_gpu_mat_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + a.numel() * sizeof(T), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), @@ -54,7 +57,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Copy the addresses of A and A_inv from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), phi::CPUPlace(), @@ -67,7 +73,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Allocate device memory for info and pivots. int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + num_ints * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); auto blas = phi::funcs::GetBlas(dev_ctx); diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu index 004375bc240..e2f344c9890 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.cu +++ b/paddle/phi/kernels/funcs/matrix_solve.cu @@ -80,7 +80,10 @@ void MatrixSolveFunctor::operator()(const Context& context, // Copy the addresses of A and tmp_b from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + context.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); paddle::memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), phi::CPUPlace(), @@ -94,7 +97,10 @@ void MatrixSolveFunctor::operator()(const Context& context, // Allocate device memory for BatchedGETRF's info and pivots. int num_ints = n < 32 ? batch_size : batch_size * (n + 1); paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc(context, num_ints * sizeof(int)); + paddle::memory::Alloc( + context.GetPlace(), + num_ints * sizeof(int), + phi::Stream(reinterpret_cast(context.stream()))); int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); auto blas = phi::funcs::GetBlas(context); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 9ced02fcb69..738f9280267 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -337,8 +337,10 @@ void SparseBlas::SPMM(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMM(handle, @@ -383,8 +385,10 @@ void SparseBlas::SPMV(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMV(handle, @@ -431,8 +435,10 @@ void SparseBlas::SDDMM(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index a6a6d409703..88bef61fa92 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ 
b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -223,7 +223,10 @@ struct MatrixEighFunctor { has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; ValueType *out_value = dev_ctx.template Alloc(eigen_values); - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batch_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *info_ptr = reinterpret_cast(info->ptr()); DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); @@ -260,7 +263,10 @@ struct MatrixEighFunctor { out_value, &workspace_size); } - auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size); + auto work = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(T) * workspace_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *work_ptr = reinterpret_cast(work->ptr()); for (auto i = 0; i < batch_size; ++i) { diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index d165edaaed7..981ec24dac6 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -122,8 +122,10 @@ void AddNKernel(const Context &dev_ctx, // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = - paddle::memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = paddle::memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index b0b0f9c5e79..51e11cc44b8 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -163,8 +163,10 @@ class LazyZeros { paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - auto d_in_starts_mem = - paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + auto d_in_starts_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); // the start index value of each tensor is @@ -186,7 +188,10 @@ class LazyZeros { paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*)); T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - auto d_out_addrs_mem = paddle::memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + auto d_out_addrs_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + xs_size * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); for (size_t i = 0; i < xs_size; ++i) { @@ -287,8 +292,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - auto d_starts_tensor = - paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + auto d_starts_tensor = paddle::memory::Alloc( + dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); // the start index value of each tensor is @@ -311,7 +318,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - 
auto d_mem = paddle::memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + auto d_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + 2 * xs_size * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); const T** d_xs = reinterpret_cast(d_mem->ptr()); T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; diff --git a/paddle/phi/kernels/gpu/box_coder.cu b/paddle/phi/kernels/gpu/box_coder.cu index e72c5f9cee1..bca18c25c3e 100644 --- a/paddle/phi/kernels/gpu/box_coder.cu +++ b/paddle/phi/kernels/gpu/box_coder.cu @@ -199,7 +199,10 @@ void BoxCoderKernel(const Context &dev_ctx, int grid = (row * col + block - 1) / block; int bytes = var_size * sizeof(float); - auto dev_var = paddle::memory::Alloc(dev_ctx, bytes); + auto dev_var = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float *dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = phi::CPUPlace(); const auto gplace = dev_ctx.GetPlace(); diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index 66b3bf22b04..7c4a497703e 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -81,7 +81,10 @@ struct MatrixBandPartFunctor { int workspace_size = 0; \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = paddle::memory::Alloc(dev_ctx, workspace_size); \ + auto workspace = paddle::memory::Alloc( \ + dev_ctx.GetPlace(), \ + workspace_size, \ + phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ @@ -146,7 +149,10 @@ void CholeskyKernel(const Context& dev_ctx, for_range(matrix_band_part_functor); } - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_count); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batch_count, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto* info_ptr = reinterpret_cast(info->ptr()); #if CUDA_VERSION >= 9020 && !defined(_WIN32) diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 08ef6ddc5b2..d9829710290 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -85,7 +85,10 @@ void FlipKernel(const Context& dev_ctx, std::vector x_stride_v = phi::vectorize(x_stride); int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto x_strides_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* x_strides_array_gpu = reinterpret_cast(x_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -95,7 +98,10 @@ void FlipKernel(const Context& dev_ctx, bytes, dev_ctx.stream()); - auto x_shape_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto x_shape_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* x_shape_array_gpu = reinterpret_cast(x_shape_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -106,7 +112,10 @@ void FlipKernel(const Context& dev_ctx, dev_ctx.stream()); bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto flip_dims_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + 
phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* flip_dims_array_gpu = reinterpret_cast(flip_dims_array_tmp->ptr()); paddle::memory::Copy(gplace, flip_dims_array_gpu, diff --git a/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu index bcda357fd8f..91abb290dd8 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu @@ -303,8 +303,10 @@ static void NMS(const phi::GPUContext &ctx, const T *boxes = proposals.data(); auto place = ctx.GetPlace(); - auto mask_ptr = - paddle::memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + auto mask_ptr = paddle::memory::Alloc( + place, + boxes_num * col_blocks * sizeof(uint64_t), + phi::Stream(reinterpret_cast(ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/phi/kernels/gpu/lu_kernel.cu b/paddle/phi/kernels/gpu/lu_kernel.cu index 7f6070a805c..7ae27df5729 100644 --- a/paddle/phi/kernels/gpu/lu_kernel.cu +++ b/paddle/phi/kernels/gpu/lu_kernel.cu @@ -105,7 +105,10 @@ void lu_decomposed_kernel(const Context& dev_ctx, int lwork; cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork); - auto work_buff = paddle::memory::Alloc(dev_ctx, lwork * sizeof(T)); + auto work_buff = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(T), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); T* d_work = reinterpret_cast(work_buff->ptr()); /* step 3: LU factorization */ diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 050c6d2faf5..48a73d143d6 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -92,7 +92,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); @@ -168,7 +171,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? 
k : m); @@ -229,7 +235,10 @@ void SyevjBatched(const phi::GPUContext& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle, @@ -281,7 +290,10 @@ void SyevjBatched(const phi::GPUContext& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { @@ -340,7 +352,10 @@ void MatrixRankTolKernel(const Context& dev_ctx, // Must Copy X once, because the gesvdj will destory the content when exit. DenseTensor x_tmp; paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp); - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batches, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* info_ptr = reinterpret_cast(info->ptr()); DenseTensor eigenvalue_tensor; diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 5a52cb33662..490753f1313 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -65,7 +65,9 @@ void NMSKernel(const Context& dev_ctx, dim3 block(threadsPerBlock); dim3 grid(blocks_per_line, blocks_per_line); auto mask_data = paddle::memory::Alloc( - dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t)); + dev_ctx.GetPlace(), + num_boxes * blocks_per_line * sizeof(uint64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); uint64_t* mask_dev = reinterpret_cast(mask_data->ptr()); NMS<<>>( boxes.data(), threshold, num_boxes, mask_dev); diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d1c8265f2fa..94f59c84693 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -127,7 +127,10 @@ void RandpermRawKernel( end_bit < 32 ? 
end_bit : 32, dev_ctx.stream()); - auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + auto d_temp_storage = paddle::memory::Alloc( + dev_ctx.GetPlace(), + temp_storage_bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), temp_storage_bytes, key.data(), diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index d173030a9c9..0673eda8d5f 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -219,8 +219,10 @@ void RoiAlignGradKernel(const Context& dev_ctx, } } } - auto roi_ptr = - paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + box_batch_id_list.numel() * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int bytes = box_batch_id_list.numel() * sizeof(int); paddle::memory::Copy( diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index a1824cc8fe4..c11ad5971c3 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -227,7 +227,10 @@ void RoiAlignKernel(const Context& dev_ctx, } } int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy( gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index 8b74e4bf2c6..6b888b200e1 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -120,7 +120,10 @@ void RoiPoolGradKernel(const Context& dev_ctx, } } int bytes = box_batch_id_list.numel() * sizeof(int); - auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index 03c92e8727f..084ecc5d131 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -184,7 +184,10 @@ void RoiPoolKernel(const Context& dev_ctx, } int bytes = box_batch_id_list.numel() * sizeof(int); - auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto box_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* box_id_data = reinterpret_cast(box_ptr->ptr()); paddle::memory::Copy(gplace, box_id_data, diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu index 97923eedcfc..f99747b0593 100644 --- a/paddle/phi/kernels/gpu/stack_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -92,8 +92,10 @@ void StackGradKernel(const Context& dev_ctx, } dy_suf = out.numel() / (split_dim * dy_pre); - auto tmp_out_data = - paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); + auto tmp_out_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + outputs.size() * sizeof(T*), + 
phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), phi::CPUPlace(), diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu index 22e8a3e4870..e5c8d392e60 100644 --- a/paddle/phi/kernels/gpu/stack_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -57,7 +57,10 @@ void StackKernel(const Context& dev_ctx, x_datas[i] = x[i]->data(); } - auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); + auto tmp_x_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + x_datas.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), phi::CPUPlace(), diff --git a/paddle/phi/kernels/gpu/svd_kernel.cu b/paddle/phi/kernels/gpu/svd_kernel.cu index d7fd3c9dffd..4d4c19cde2b 100644 --- a/paddle/phi/kernels/gpu/svd_kernel.cu +++ b/paddle/phi/kernels/gpu/svd_kernel.cu @@ -77,7 +77,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); @@ -155,7 +158,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? 
k : m); diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index 1d8d8a6dfac..555c45b8ad3 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -86,7 +86,10 @@ void SyncBatchNormKernel(const Context &ctx, // x, x^2, 1, here 1 is used to calc device num // device num also can be got from platform::DeviceContextPool const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - alloc_ptr = paddle::memory::Alloc(ctx, bytes); + alloc_ptr = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); const int threads = 256; diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index 27c3cd4ad44..348c4982d04 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -365,7 +365,10 @@ void SyncBatchNormGradFunctor( const auto *saved_inv_var = saved_variance.template data>(); const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - auto alloc_ptr = paddle::memory::Alloc(ctx, bytes); + auto alloc_ptr = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); const int block = 512; diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index a48afeb2c79..4728ae38d01 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -94,7 +94,10 @@ void TriangularSolveKernel(const Context& dev_ctx, // Copy the addresses of A and tmp_b from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu index 446a21e6a80..8baf339f0c6 100644 --- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -129,9 +129,9 @@ void YoloBoxKernel(const Context& dev_ctx, int input_size_w = downsample_ratio * w; int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = - paddle::memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + DenseTensor tmp_anchors; + tmp_anchors.Resize(phi::make_dim(anchors.size())); + int* anchors_data = dev_ctx.template Alloc(&tmp_anchors); const auto gplace = dev_ctx.GetPlace(); const auto cplace = phi::CPUPlace(); paddle::memory::Copy( -- GitLab
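
The recurring change in this patch is the removal of the `memory::Alloc(dev_ctx, size)` overload: call sites either allocate a raw buffer at the context's place on the context's stream, or allocate a typed temporary `phi::DenseTensor` directly through the device context. Below is a minimal sketch of those two replacement forms, assembled from the call patterns visible in the hunks above. It is not part of the patch; the wrapper function, the header list, and the `bytes` parameter are illustrative assumptions, so treat it as a sketch of the call pattern rather than a drop-in kernel.

    // Hedged sketch (not from the patch): the two allocation forms that replace
    // the removed memory::Alloc(dev_ctx, size) overload. Header paths follow the
    // files touched above; context and size are placeholders.
    #include "paddle/fluid/memory/malloc.h"
    #include "paddle/phi/backends/gpu/gpu_context.h"
    #include "paddle/phi/core/dense_tensor.h"

    template <typename T>
    void AllocExamples(const phi::GPUContext& dev_ctx, size_t bytes) {
      // 1) Raw buffer tied to the context's place and stream, as used for the
      //    scratch buffers in the GPU kernels above.
      auto buffer = paddle::memory::Alloc(
          dev_ctx.GetPlace(),
          bytes,
          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
      void* raw_ptr = buffer->ptr();  // lifetime managed by the AllocationPtr

      // 2) Typed temporary allocated through the device context itself, as done
      //    for the temporary scale tensors in fake_quantize_op.h.
      phi::DenseTensor tmp;
      tmp.Resize(phi::make_dim(1));
      T* tmp_data = dev_ctx.template Alloc<T>(&tmp);

      (void)raw_ptr;
      (void)tmp_data;
    }

Form (1) keeps the allocation on the caller's stream without routing through the device context, which is what lets the allocator library drop its dependency on device_context; form (2) is preferred where the buffer is naturally a tensor, as in `AllocateTmpTensor` and the quantize kernels above.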