Commit d1bfb4b7 (unverified)
Authored Feb 10, 2023 by Huang Jiyi; committed via GitHub on Feb 10, 2023
[phi decoupling] remove AllocatorFacade in phi (#50380)
* remove AllocatorFacade in phi
* fix include
* fix bugs
Parent: 13f57ec0
Showing 3 changed files with 108 additions and 153 deletions (+108 −153)
paddle/phi/tests/common/test_scalar.cu            +31 −48
paddle/phi/tests/kernels/strided_memcpy_test.cc   +18 −20
paddle/phi/tests/kernels/test_math_function.cu    +59 −85
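All three tests change in the same way: instead of constructing a phi device context locally and wiring its allocator through the fluid-layer AllocatorFacade, each test now fetches the already-initialized context for the target place from phi::DeviceContextPool. A minimal before/after sketch of that pattern follows; the names (dense_x, dev_ctx) and headers are taken from the diff itself, and the two fragments are alternatives drawn from removed and added lines, not one compilable unit.

// Before (removed): build a context by hand and inject an allocator from
// the fluid-layer facade declared in
// "paddle/fluid/memory/allocation/allocator_facade.h".
phi::CPUContext dev_ctx;
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                         .GetAllocator(phi::CPUPlace())
                         .get());
auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);

// After (added): fetch the registered context for the place from
// phi::DeviceContextPool ("paddle/phi/backends/all_context.h"), so the
// test no longer touches AllocatorFacade.
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
auto* dense_x_data = dev_ctx->Alloc<float>(&dense_x);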
paddle/phi/tests/common/test_scalar.cu

@@ -15,9 +15,9 @@ limitations under the License. */
 #include <map>  // NOLINT
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"

@@ -44,12 +44,10 @@ TEST(Scalar, ConstructFromDenseTensor1) {
       alloc.get(),
       phi::DenseTensorMeta(phi::DataType::FLOAT16,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<float16>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float16>(), 1e-6);

@@ -63,12 +61,10 @@ TEST(Scalar, ConstructFromDenseTensor2) {
       alloc.get(),
       phi::DenseTensorMeta(phi::DataType::INT16,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<int16_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int16_t>());

@@ -82,12 +78,10 @@ TEST(Scalar, ConstructFromDenseTensor3) {
       alloc.get(),
       phi::DenseTensorMeta(phi::DataType::INT8,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<int8_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int8_t>());

@@ -101,12 +95,10 @@ TEST(Scalar, ConstructFromDenseTensor4) {
       alloc.get(),
       phi::DenseTensorMeta(phi::DataType::BOOL,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<bool>(&dense_x);
   dense_x_data[0] = true;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(true, scalar_test.to<bool>());

@@ -120,12 +112,10 @@ TEST(Scalar, ConstructFromDenseTensor5) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX64,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<complex64>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex64 expected_value(1, 0);

@@ -140,12 +130,10 @@ TEST(Scalar, ConstructFromDenseTensor6) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX128,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
-  auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<complex128>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex128 expected_value(1, 0);

@@ -160,15 +148,12 @@ TEST(Scalar, ConstructFromDenseTensor7) {
       alloc.get(),
       phi::DenseTensorMeta(phi::DataType::FLOAT32,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-  auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<float>(&dense_x);
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
 }

@@ -182,14 +167,12 @@ TEST(Scalar, ConstructFromTensor) {
       phi::DenseTensorMeta(phi::DataType::FLOAT32,
                            phi::make_ddim({1}),
                            phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-  auto* dense_x_data = dev_ctx.Alloc<float>(dense_x.get());
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+  auto* dense_x_data = dev_ctx->Alloc<float>(dense_x.get());
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   paddle::experimental::Tensor x(dense_x);
   paddle::experimental::Scalar scalar_test(x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
paddle/phi/tests/kernels/strided_memcpy_test.cc

@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/all_context.h"
 namespace phi {
 namespace tests {

@@ -89,16 +91,13 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
   phi::DDim src_stride({5, 1});

@@ -110,10 +109,10 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::DDim dst_stride({2, 1});
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
   ASSERT_EQ(1, dst[0]);
   ASSERT_EQ(2, dst[1]);

@@ -131,14 +130,13 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
   int dst[8];
   auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));

@@ -149,12 +147,12 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::DDim dst_stride({4, 1});
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
   // clang-format off
   int expect_dst[] = {
paddle/phi/tests/kernels/test_math_function.cu

@@ -14,6 +14,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

@@ -51,11 +52,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};

@@ -65,13 +63,13 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 5);
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);

@@ -87,14 +85,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }

@@ -107,18 +102,18 @@ TEST(math_function, notrans_mul_trans_fp16) {
   out_gpu.mutable_data<phi::dtype::float16>({2, 2}, gpu_place);
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               false,
-                                               input2_gpu,
-                                               true,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                false,
+                                                input2_gpu,
+                                                true,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);

@@ -134,11 +129,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};

@@ -149,13 +141,13 @@ TEST(math_function, trans_mul_notrans_fp32) {
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 9);
   EXPECT_EQ(out_ptr[1], 12);
   EXPECT_EQ(out_ptr[2], 15);

@@ -176,14 +168,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }

@@ -196,18 +185,18 @@ TEST(math_function, trans_mul_notrans_fp16) {
   out_gpu.mutable_data<phi::dtype::float16>({3, 3}, gpu_place);
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               true,
-                                               input2_gpu,
-                                               false,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                true,
+                                                input2_gpu,
+                                                false,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);

@@ -229,11 +218,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   int m = 2;
   int n = 3;

@@ -255,7 +241,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
       false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);

@@ -266,7 +252,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);

@@ -287,14 +273,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }

@@ -320,7 +303,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
       false,
       false,
       m,

@@ -343,7 +326,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);

@@ -364,11 +347,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   int m = 2;
   int n = 3;

@@ -390,12 +370,12 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
       false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);

@@ -416,14 +396,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }

@@ -449,7 +426,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
      false,
      true,
      m,

@@ -466,7 +443,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);

@@ -485,11 +462,8 @@ void GemvTest(int m, int n, bool trans) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);

@@ -512,14 +486,14 @@ void GemvTest(int m, int n, bool trans) {
   paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
   paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
-  GetBlas<T>(context).GEMV(trans,
-                           static_cast<int>(m),
-                           static_cast<int>(n),
-                           1.,
-                           g_data_a,
-                           g_data_b,
-                           0.,
-                           g_data_c);
+  GetBlas<T>(*context).GEMV(trans,
+                            static_cast<int>(m),
+                            static_cast<int>(n),
+                            1.,
+                            g_data_a,
+                            g_data_b,
+                            0.,
+                            g_data_c);
   paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);