diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu
index ad41201b9302fe494618c67ddd6ee0730ac185ad..8768fc6f538c509edb102c6551ef665790573f2d 100644
--- a/paddle/phi/tests/common/test_scalar.cu
+++ b/paddle/phi/tests/common/test_scalar.cu
@@ -15,9 +15,9 @@ limitations under the License. */
 #include   // NOLINT
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
@@ -44,12 +44,10 @@ TEST(Scalar, ConstructFromDenseTensor1) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<float16>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float16>(), 1e-6);
@@ -63,12 +61,10 @@ TEST(Scalar, ConstructFromDenseTensor2) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<int16_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int16_t>());
@@ -82,12 +78,10 @@ TEST(Scalar, ConstructFromDenseTensor3) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<int8_t>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(1, scalar_test.to<int8_t>());
@@ -101,12 +95,10 @@ TEST(Scalar, ConstructFromDenseTensor4) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<bool>(&dense_x);
   dense_x_data[0] = true;
   phi::Scalar scalar_test(dense_x);
   ASSERT_EQ(true, scalar_test.to<bool>());
@@ -120,12 +112,10 @@ TEST(Scalar, ConstructFromDenseTensor5) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX64,
                           phi::make_ddim({1}),
                           phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<complex64>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex64 expected_value(1, 0);
@@ -140,12 +130,10 @@ TEST(Scalar, ConstructFromDenseTensor6) {
       phi::DenseTensorMeta(phi::DataType::COMPLEX128,
                           phi::make_ddim({1}),
                           phi::DataLayout::NCHW));
-  phi::CPUContext dev_ctx;
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::CPUPlace())
-                           .get());
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
 
-  auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
+  auto* dense_x_data = dev_ctx->Alloc<complex128>(&dense_x);
   dense_x_data[0] = 1;
   phi::Scalar scalar_test(dense_x);
   complex128 expected_value(1, 0);
@@ -160,15 +148,12 @@ TEST(Scalar, ConstructFromDenseTensor7) {
       alloc.get(),
       phi::DenseTensorMeta(
           phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-
-  auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
+  auto* dense_x_data = dev_ctx->Alloc<float>(&dense_x);
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   phi::Scalar scalar_test(dense_x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
 }
@@ -182,14 +167,12 @@ TEST(Scalar, ConstructFromTensor) {
       phi::DenseTensorMeta(
           phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-  phi::GPUContext dev_ctx{phi::GPUPlace()};
-  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(phi::GPUPlace())
-                           .get());
-  dev_ctx.Init();
-  auto* dense_x_data = dev_ctx.Alloc<float>(dense_x.get());
-  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-  dev_ctx.Wait();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
+  auto* dense_x_data = dev_ctx->Alloc<float>(dense_x.get());
+  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+  dev_ctx->Wait();
   paddle::experimental::Tensor x(dense_x);
   paddle::experimental::Scalar scalar_test(x);
   ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
diff --git a/paddle/phi/tests/kernels/strided_memcpy_test.cc b/paddle/phi/tests/kernels/strided_memcpy_test.cc
index 7ffc83bb31bb8c55e484fb177b2e660606a3ba20..c9b3842a5299d35e6d46eebf2298935f61f79157 100644
--- a/paddle/phi/tests/kernels/strided_memcpy_test.cc
+++ b/paddle/phi/tests/kernels/strided_memcpy_test.cc
@@ -15,7 +15,9 @@ limitations under the License.
 */
 #include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/all_context.h"
 
 namespace phi {
 namespace tests {
@@ -89,16 +91,13 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
 
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
 
   int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
 
   phi::DDim src_stride({5, 1});
@@ -110,10 +109,10 @@ TEST(StridedMemcpy, GPUCrop) {
   phi::DDim dst_stride({2, 1});
 
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
 
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
 
   ASSERT_EQ(1, dst[0]);
   ASSERT_EQ(2, dst[1]);
@@ -131,14 +130,13 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::GPUPlace gpu0(0);
   phi::CPUPlace cpu;
 
-  phi::GPUContext ctx(gpu0);
-  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                       .GetAllocator(gpu0, ctx.stream())
-                       .get());
-  ctx.PartialInitWithAllocator();
+
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+
   auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
 
   int dst[8];
   auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
@@ -149,12 +147,12 @@ TEST(StridedMemcpy, GPUConcat) {
   phi::DDim dst_stride({4, 1});
 
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   phi::funcs::StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
+      *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
 
-  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+  ctx->Wait();
 
   // clang-format off
   int expect_dst[] = {
diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu
index aa16c232c7988c0390ef461b1a60bb9b767ab62a..cbb76989da1fba1d66660599271825872680e609 100644
--- a/paddle/phi/tests/kernels/test_math_function.cu
+++ b/paddle/phi/tests/kernels/test_math_function.cu
@@ -14,6 +14,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -51,11 +52,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -65,13 +63,13 @@ TEST(math_function, notrans_mul_trans_fp32) {
   paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 5);
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
@@ -87,14 +85,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -107,18 +102,18 @@ TEST(math_function, notrans_mul_trans_fp16) {
 
   out_gpu.mutable_data<phi::dtype::float16>({2, 2}, gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               false,
-                                               input2_gpu,
-                                               true,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                false,
+                                                input2_gpu,
+                                                true,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
@@ -134,11 +129,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -149,13 +141,13 @@ TEST(math_function, trans_mul_notrans_fp32) {
 
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
-  GetBlas<float>(context).MatMul(
+  GetBlas<float>(*context).MatMul(
       input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(out_ptr[0], 9);
   EXPECT_EQ(out_ptr[1], 12);
   EXPECT_EQ(out_ptr[2], 15);
@@ -176,14 +168,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -196,18 +185,18 @@ TEST(math_function, trans_mul_notrans_fp16) {
 
   out_gpu.mutable_data<phi::dtype::float16>({3, 3}, gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-                                               true,
-                                               input2_gpu,
-                                               false,
-                                               phi::dtype::float16(1),
-                                               &out_gpu,
-                                               phi::dtype::float16(0));
+  GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+                                                true,
+                                                input2_gpu,
+                                                false,
+                                                phi::dtype::float16(1),
+                                                &out_gpu,
+                                                phi::dtype::float16(0));
 
   paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
   EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
@@ -229,11 +218,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   int m = 2;
   int n = 3;
@@ -255,7 +241,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
       false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
@@ -266,7 +252,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
@@ -287,14 +273,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -320,7 +303,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
       false,
       false,
       m,
@@ -343,7 +326,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
@@ -364,11 +347,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   int m = 2;
   int n = 3;
@@ -390,12 +370,12 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  GetBlas<float>(context).GEMM(
+  GetBlas<float>(*context).GEMM(
      false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
@@ -416,14 +396,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
+  if (context->GetComputeCapability() < 53) {
     return;
   }
 
@@ -449,7 +426,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   phi::dtype::float16* c =
       input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
 
-  GetBlas<phi::dtype::float16>(context).GEMM(
+  GetBlas<phi::dtype::float16>(*context).GEMM(
      false,
      true,
      m,
@@ -466,7 +443,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
 
   paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
-  context.Wait();
+  context->Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
   EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
   EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
@@ -485,11 +462,8 @@ void GemvTest(int m, int n, bool trans) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
 
-  phi::GPUContext context(gpu_place);
-  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(gpu_place, context.stream())
-                           .get());
-  context.PartialInitWithAllocator();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
 
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
@@ -512,14 +486,14 @@ void GemvTest(int m, int n, bool trans) {
   paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
   paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
 
-  GetBlas<T>(context).GEMV(trans,
-                           static_cast<int>(m),
-                           static_cast<int>(n),
-                           1.,
-                           g_data_a,
-                           g_data_b,
-                           0.,
-                           g_data_c);
+  GetBlas<T>(*context).GEMV(trans,
+                            static_cast<int>(m),
+                            static_cast<int>(n),
+                            1.,
+                            g_data_a,
+                            g_data_b,
+                            0.,
+                            g_data_c);
 
   paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
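
Note (not part of the patch): every hunk above applies the same substitution, so the pattern is worth stating once. Instead of constructing a phi::CPUContext or phi::GPUContext inside the test and wiring its allocator through AllocatorFacade, the test fetches the already-initialized, place-bound context from the global phi::DeviceContextPool and casts it to the concrete context type. The sketch below is illustrative only, assembled from the calls that appear in the diff (DeviceContextPool::Instance, Get, Alloc); FillOneOnCPU is a hypothetical helper, not code from the repository.

  // Illustrative sketch: use the pooled, pre-initialized context
  // instead of building and configuring one by hand.
  #include "paddle/phi/backends/all_context.h"

  void FillOneOnCPU(phi::DenseTensor* dense_x) {
    // The pool owns one long-lived context per place; Get() returns a
    // phi::DeviceContext*, so the caller casts to the concrete type.
    phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
    auto* dev_ctx =
        reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));

    // Allocation goes through the pooled context; no SetAllocator() or
    // AllocatorFacade dependency is needed in the test itself.
    auto* data = dev_ctx->Alloc<float>(dense_x);
    data[0] = 1.0f;
  }

Because the pool hands out shared contexts, tests should only consume them (Alloc, stream, Wait) rather than reconfigure them, which is exactly what the removed SetAllocator/PartialInitWithAllocator/Init lines were doing.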