diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu
index ad41201b9302fe494618c67ddd6ee0730ac185ad..8768fc6f538c509edb102c6551ef665790573f2d 100644
--- a/paddle/phi/tests/common/test_scalar.cu
+++ b/paddle/phi/tests/common/test_scalar.cu
@@ -15,9 +15,9 @@ limitations under the License. */
 #include   // NOLINT
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
@@ -44,12 +44,10 @@ TEST(Scalar, ConstructFromDenseTensor1) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<float16>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float16>(), 1e-6);
@@ -63,12 +61,10 @@ TEST(Scalar, ConstructFromDenseTensor2) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<int16_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int16_t>());
@@ -82,12 +78,10 @@ TEST(Scalar, ConstructFromDenseTensor3) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<int8_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int8_t>());
@@ -101,12 +95,10 @@ TEST(Scalar, ConstructFromDenseTensor4) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<bool>(&dense_x);
   dense_x_data[0] = true;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(true, scalar_test.to<bool>());
@@ -120,12 +112,10 @@ TEST(Scalar, ConstructFromDenseTensor5) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX64,
                           phi::make_ddim({1}),
                           phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<complex64>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex64 expected_value(1, 0);
@@ -140,12 +130,10 @@ TEST(Scalar, ConstructFromDenseTensor6) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX128,
                           phi::make_ddim({1}),
                           phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<complex128>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex128 expected_value(1, 0);
@@ -160,15 +148,12 @@ TEST(Scalar, ConstructFromDenseTensor7) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-
-  auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
+  auto* dense_x_data = dev_ctx->Alloc<float>(&dense_x);
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
 }
@@ -182,14 +167,12 @@ TEST(Scalar, ConstructFromTensor) {
       phi::DenseTensorMeta(
           phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-  auto* dense_x_data = dev_ctx.Alloc<float>(dense_x.get());
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
+  auto* dense_x_data = dev_ctx->Alloc<float>(dense_x.get());
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   paddle::experimental::Tensor x(dense_x);
   paddle::experimental::Scalar scalar_test(x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
diff --git a/paddle/phi/tests/kernels/strided_memcpy_test.cc b/paddle/phi/tests/kernels/strided_memcpy_test.cc
index 7ffc83bb31bb8c55e484fb177b2e660606a3ba20..c9b3842a5299d35e6d46eebf2298935f61f79157 100644
--- a/paddle/phi/tests/kernels/strided_memcpy_test.cc
+++ b/paddle/phi/tests/kernels/strided_memcpy_test.cc
@@ -15,7 +15,9 @@ limitations under the License.
 */
 #include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/all_context.h"
 
 namespace phi {
 namespace tests {
@@ -89,16 +91,13 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
 
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
 
   int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
 
   phi::DDim src_stride({5, 1});
@@ -110,10 +109,10 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::DDim dst_stride({2, 1});
 
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
 
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
 
   ASSERT_EQ(1, dst[0]);
   ASSERT_EQ(2, dst[1]);
@@ -131,14 +130,13 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
 
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
   auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
 
   int dst[8];
   auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
@@ -149,12 +147,12 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::DDim dst_stride({4, 1});
 
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
 
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
 
   // clang-format off
   int expect_dst[] = {
diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu
index aa16c232c7988c0390ef461b1a60bb9b767ab62a..cbb76989da1fba1d66660599271825872680e609 100644
--- a/paddle/phi/tests/kernels/test_math_function.cu
+++ b/paddle/phi/tests/kernels/test_math_function.cu
@@ -14,6 +14,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -51,11 +52,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -65,13 +63,13 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 5);
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
@@ -87,14 +85,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -107,18 +102,18 @@ TEST(math_function, notrans_mul_trans_fp16) {
 
   out_gpu.mutable_data<phi::dtype::float16>({2, 2}, gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               false,
-                                               input2_gpu,
-                                               true,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                false,
+                                                input2_gpu,
+                                                true,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
@@ -134,11 +129,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -149,13 +141,13 @@ TEST(math_function, trans_mul_notrans_fp32) {
 
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 9);
   EXPECT_EQ(out_ptr[1], 12);
   EXPECT_EQ(out_ptr[2], 15);
@@ -176,14 +168,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -196,18 +185,18 @@ TEST(math_function, trans_mul_notrans_fp16) {
 
   out_gpu.mutable_data<phi::dtype::float16>({3, 3}, gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               true,
-                                               input2_gpu,
-                                               false,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                true,
+                                                input2_gpu,
+                                                false,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
@@ -229,11 +218,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   int m = 2;
   int n = 3;
@@ -255,7 +241,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
       false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
@@ -266,7 +252,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
@@ -287,14 +273,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -320,7 +303,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
       false,
       false,
       m,
@@ -343,7 +326,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
@@ -364,11 +347,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   int m = 2;
   int n = 3;
@@ -390,12 +370,12 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
      false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
@@ -416,14 +396,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -449,7 +426,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
      false,
      true,
      m,
@@ -466,7 +443,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
@@ -485,11 +462,8 @@ void GemvTest(int m, int n, bool trans) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
@@ -512,14 +486,14 @@ void GemvTest(int m, int n, bool trans) {
   paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
   paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
 
-  GetBlas<T>(context).GEMV(trans,
-                           static_cast<int>(m),
-                           static_cast<int>(n),
-                           1.,
-                           g_data_a,
-                           g_data_b,
-                           0.,
-                           g_data_c);
+  GetBlas<T>(*context).GEMV(trans,
+                            static_cast<int>(m),
+                            static_cast<int>(n),
+                            1.,
+                            g_data_a,
+                            g_data_b,
+                            0.,
+                            g_data_c);
 
   paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
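
Note (not part of the patch): every hunk above applies the same substitution, so the pattern is worth stating once. Instead of constructing a phi::CPUContext or phi::GPUContext inside the test and wiring its allocator through AllocatorFacade, the test fetches the already-initialized, place-bound context from the global phi::DeviceContextPool and casts it to the concrete context type. The sketch below is illustrative only, assembled from the calls that appear in the diff (DeviceContextPool::Instance, Get, Alloc); FillOneOnCPU is a hypothetical helper, not code from the repository.

  // Illustrative sketch: use the pooled, pre-initialized context
  // instead of building and configuring one by hand.
  #include "paddle/phi/backends/all_context.h"

  void FillOneOnCPU(phi::DenseTensor* dense_x) {
    // The pool owns one long-lived context per place; Get() returns a
    // phi::DeviceContext*, so the caller casts to the concrete type.
    phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
    auto* dev_ctx =
        reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));

    // Allocation goes through the pooled context; no SetAllocator() or
    // AllocatorFacade dependency is needed in the test itself.
    auto* data = dev_ctx->Alloc<float>(dense_x);
    data[0] = 1.0f;
  }

Because the pool hands out shared contexts, tests should only consume them (Alloc, stream, Wait) rather than reconfigure them, which is exactly what the removed SetAllocator/PartialInitWithAllocator/Init lines were doing.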