Unverified commit d1bfb4b7, authored by Huang Jiyi, committed by GitHub

[phi decoupling] remove AllocatorFacade in phi (#50380)

* remove AllocatorFacade in phi

* fix include

* fix bugs
Parent 13f57ec0
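Note: all three test files touched by this commit apply the same substitution. Instead of constructing a backend context locally and wiring its allocator through the fluid AllocatorFacade, each test now fetches an already initialized context from phi's DeviceContextPool. A minimal before/after sketch, using only identifiers that appear in the hunks below (not a complete translation unit from the repo):

// Before (removed): build the context locally and hook up its allocator.
//   phi::GPUContext ctx(gpu0);
//   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
//                        .GetAllocator(gpu0, ctx.stream())
//                        .get());
//   ctx.PartialInitWithAllocator();
//
// After (added): ask the pool, which owns and initializes the context.
#include "paddle/phi/backends/all_context.h"

phi::GPUContext* PooledGpuContext() {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  return reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
}

PooledGpuContext is a hypothetical helper name used here only for illustration; the tests inline these two lines at each call site.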
......@@ -15,9 +15,9 @@ limitations under the License. */
#include <map> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
......@@ -44,12 +44,10 @@ TEST(Scalar, ConstructFromDenseTensor1) {
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<float16>(&dense_x);
dense_x_data[0] = 1;
phi::Scalar scalar_test(dense_x);
ASSERT_NEAR(1, scalar_test.to<float16>(), 1e-6);
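Note: because dev_ctx is now a pointer returned by the pool rather than a local object, the allocation call changes from dev_ctx.Alloc<...>() to dev_ctx->Alloc<...>(); the same two-line substitution repeats in the remaining ConstructFromDenseTensor tests below. A rough sketch of the new CPU-side setup (FillScalarInputOnCpu is an illustrative name, and float16 stands for phi::dtype::float16 as in this file):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"

void FillScalarInputOnCpu(phi::DenseTensor* dense_x) {
  // The pool-owned CPUContext already has its allocator configured.
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
  auto* dense_x_data = dev_ctx->Alloc<phi::dtype::float16>(dense_x);
  dense_x_data[0] = 1;  // the value the test reads back through phi::Scalar
}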
......@@ -63,12 +61,10 @@ TEST(Scalar, ConstructFromDenseTensor2) {
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<int16_t>(&dense_x);
dense_x_data[0] = 1;
phi::Scalar scalar_test(dense_x);
ASSERT_EQ(1, scalar_test.to<int16_t>());
......@@ -82,12 +78,10 @@ TEST(Scalar, ConstructFromDenseTensor3) {
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<int8_t>(&dense_x);
dense_x_data[0] = 1;
phi::Scalar scalar_test(dense_x);
ASSERT_EQ(1, scalar_test.to<int8_t>());
......@@ -101,12 +95,10 @@ TEST(Scalar, ConstructFromDenseTensor4) {
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<bool>(&dense_x);
dense_x_data[0] = true;
phi::Scalar scalar_test(dense_x);
ASSERT_EQ(true, scalar_test.to<bool>());
......@@ -120,12 +112,10 @@ TEST(Scalar, ConstructFromDenseTensor5) {
phi::DenseTensorMeta(phi::DataType::COMPLEX64,
phi::make_ddim({1}),
phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<complex64>(&dense_x);
dense_x_data[0] = 1;
phi::Scalar scalar_test(dense_x);
complex64 expected_value(1, 0);
......@@ -140,12 +130,10 @@ TEST(Scalar, ConstructFromDenseTensor6) {
phi::DenseTensorMeta(phi::DataType::COMPLEX128,
phi::make_ddim({1}),
phi::DataLayout::NCHW));
-phi::CPUContext dev_ctx;
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::CPUPlace())
-.get());
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
-auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
+auto* dense_x_data = dev_ctx->Alloc<complex128>(&dense_x);
dense_x_data[0] = 1;
phi::Scalar scalar_test(dense_x);
complex128 expected_value(1, 0);
......@@ -160,15 +148,12 @@ TEST(Scalar, ConstructFromDenseTensor7) {
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::GPUContext dev_ctx{phi::GPUPlace()};
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::GPUPlace())
-.get());
-dev_ctx.Init();
-auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
-FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-dev_ctx.Wait();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+auto* dense_x_data = dev_ctx->Alloc<float>(&dense_x);
+FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+dev_ctx->Wait();
phi::Scalar scalar_test(dense_x);
ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
}
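Note: the GPU variants additionally drop the explicit dev_ctx.Init() call; the pooled phi::GPUContext comes back with its allocator and stream set up, so the test just launches on dev_ctx->stream() and waits. A sketch assuming a CUDA build and the FillTensor kernel defined near the top of this test file (FillScalarInputOnGpu is an illustrative name):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"

void FillScalarInputOnGpu(phi::DenseTensor* dense_x) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  auto* dense_x_data = dev_ctx->Alloc<float>(dense_x);
  // Run the fill kernel on the context's stream and block until it finishes,
  // so the host-side Scalar comparison sees the written value.
  FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
  dev_ctx->Wait();
}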
......@@ -182,14 +167,12 @@ TEST(Scalar, ConstructFromTensor) {
phi::DenseTensorMeta(
phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
-phi::GPUContext dev_ctx{phi::GPUPlace()};
-dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(phi::GPUPlace())
-.get());
-dev_ctx.Init();
-auto* dense_x_data = dev_ctx.Alloc<float>(dense_x.get());
-FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
-dev_ctx.Wait();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
+auto* dense_x_data = dev_ctx->Alloc<float>(dense_x.get());
+FillTensor<<<1, 1, 0, dev_ctx->stream()>>>(dense_x_data);
+dev_ctx->Wait();
paddle::experimental::Tensor x(dense_x);
paddle::experimental::Scalar scalar_test(x);
ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
namespace phi {
namespace tests {
......@@ -89,16 +91,13 @@ TEST(StridedMemcpy, GPUCrop) {
phi::GPUPlace gpu0(0);
phi::CPUPlace cpu;
-phi::GPUContext ctx(gpu0);
-ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu0, ctx.stream())
-.get());
-ctx.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1});
......@@ -110,10 +109,10 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DDim dst_stride({2, 1});
phi::funcs::StridedMemcpy<int>(
-ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
+*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
-paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-ctx.Wait();
+paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+ctx->Wait();
ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]);
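Note: phi::funcs::StridedMemcpy takes the device context by const reference, so with ctx now a pointer obtained from the pool the call sites dereference it (*ctx), and the copies and synchronization move to ctx->stream() / ctx->Wait(). A sketch of the updated call shape (CropOnGpu and the concrete dims are illustrative, not taken verbatim from the test):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"

void CropOnGpu(const int* gpu_src, int* gpu_dst) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  phi::DDim src_stride({5, 1});
  phi::DDim dst_dim({2, 2});
  phi::DDim dst_stride({2, 1});
  // StridedMemcpy expects a context reference, hence the dereference.
  phi::funcs::StridedMemcpy<int>(
      *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
  ctx->Wait();
}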
......@@ -131,14 +130,13 @@ TEST(StridedMemcpy, GPUConcat) {
phi::GPUPlace gpu0(0);
phi::CPUPlace cpu;
-phi::GPUContext ctx(gpu0);
-ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu0, ctx.stream())
-.get());
-ctx.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
......@@ -149,12 +147,12 @@ TEST(StridedMemcpy, GPUConcat) {
phi::DDim dst_stride({4, 1});
phi::funcs::StridedMemcpy<int>(
-ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
phi::funcs::StridedMemcpy<int>(
-ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
+*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
-paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-ctx.Wait();
+paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
+ctx->Wait();
// clang-format off
int expect_dst[] = {
......
......@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -51,11 +52,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -65,13 +63,13 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, gpu_place);
-GetBlas<float>(context).MatMul(
+GetBlas<float>(*context).MatMul(
input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>();
-context.Wait();
+context->Wait();
EXPECT_EQ(out_ptr[0], 5);
EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14);
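Note: the Blas wrapper is also obtained from a context reference, so the call sites dereference the pooled pointer and the final synchronization becomes context->Wait(). The sketch below calls phi::funcs::GetBlas directly instead of the file-local GetBlas<T> helper; NotransMulTransSketch is an illustrative name:

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"

void NotransMulTransSketch(const phi::DenseTensor& input1_gpu,
                           const phi::DenseTensor& input2_gpu,
                           phi::DenseTensor* out_gpu) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  // out_gpu = input1_gpu * input2_gpu^T, computed on context->stream().
  phi::funcs::GetBlas<phi::GPUContext, float>(*context).MatMul(
      input1_gpu, false, input2_gpu, true, 1, out_gpu, 0);
  context->Wait();
}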
......@@ -87,14 +85,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
-if (context.GetComputeCapability() < 53) {
+if (context->GetComputeCapability() < 53) {
return;
}
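Note: the fp16 paths keep their early-exit guard; only the member access changes because context is now a pointer. An equivalent standalone check could look like the sketch below (Fp16GemmSupported is a hypothetical helper, not part of the file):

#include "paddle/phi/backends/all_context.h"

bool Fp16GemmSupported() {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  // cuBLAS fp16 GEMM requires compute capability >= 53, as the guard above notes.
  return context->GetComputeCapability() >= 53;
}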
......@@ -107,18 +102,18 @@ TEST(math_function, notrans_mul_trans_fp16) {
out_gpu.mutable_data<phi::dtype::float16>({2, 2}, gpu_place);
-GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-false,
-input2_gpu,
-true,
-phi::dtype::float16(1),
-&out_gpu,
-phi::dtype::float16(0));
+GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+false,
+input2_gpu,
+true,
+phi::dtype::float16(1),
+&out_gpu,
+phi::dtype::float16(0));
paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-context.Wait();
+context->Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
......@@ -134,11 +129,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -149,13 +141,13 @@ TEST(math_function, trans_mul_notrans_fp32) {
out_gpu.mutable_data<float>({3, 3}, gpu_place);
-GetBlas<float>(context).MatMul(
+GetBlas<float>(*context).MatMul(
input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>();
-context.Wait();
+context->Wait();
EXPECT_EQ(out_ptr[0], 9);
EXPECT_EQ(out_ptr[1], 12);
EXPECT_EQ(out_ptr[2], 15);
......@@ -176,14 +168,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
-if (context.GetComputeCapability() < 53) {
+if (context->GetComputeCapability() < 53) {
return;
}
......@@ -196,18 +185,18 @@ TEST(math_function, trans_mul_notrans_fp16) {
out_gpu.mutable_data<phi::dtype::float16>({3, 3}, gpu_place);
-GetBlas<phi::dtype::float16>(context).MatMul(input1_gpu,
-true,
-input2_gpu,
-false,
-phi::dtype::float16(1),
-&out_gpu,
-phi::dtype::float16(0));
+GetBlas<phi::dtype::float16>(*context).MatMul(input1_gpu,
+true,
+input2_gpu,
+false,
+phi::dtype::float16(1),
+&out_gpu,
+phi::dtype::float16(0));
paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
phi::dtype::float16* out_ptr = out.data<phi::dtype::float16>();
-context.Wait();
+context->Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
......@@ -229,11 +218,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
int m = 2;
int n = 3;
......@@ -255,7 +241,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
-GetBlas<float>(context).GEMM(
+GetBlas<float>(*context).GEMM(
false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
......@@ -266,7 +252,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
-context.Wait();
+context->Wait();
EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24);
EXPECT_EQ(input3_ptr[2], 28);
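Note: the raw-pointer GEMM calls change in the same way. Per the comment above, the expected result corresponds to out[:, 1:] = np.dot(a, b) + c, with leading dimensions 3, 4, 4 for the a, b, c buffers. A sketch with the pooled context (GemmNotransSketch is an illustrative name; a, b, c are the device pointers prepared by the test):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"

void GemmNotransSketch(int m, int n, int k,
                       const float* a, const float* b, float* c) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  // Same argument list as the test: no transpose, write into the c + 1 view.
  phi::funcs::GetBlas<phi::GPUContext, float>(*context).GEMM(
      false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
  context->Wait();
}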
......@@ -287,14 +273,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
-if (context.GetComputeCapability() < 53) {
+if (context->GetComputeCapability() < 53) {
return;
}
......@@ -320,7 +303,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
phi::dtype::float16* c =
input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
-GetBlas<phi::dtype::float16>(context).GEMM(
+GetBlas<phi::dtype::float16>(*context).GEMM(
false,
false,
m,
......@@ -343,7 +326,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
-context.Wait();
+context->Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
......@@ -364,11 +347,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
int m = 2;
int n = 3;
......@@ -390,12 +370,12 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
-GetBlas<float>(context).GEMM(
+GetBlas<float>(*context).GEMM(
false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-context.Wait();
+context->Wait();
EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24);
EXPECT_EQ(input3_ptr[2], 28);
......@@ -416,14 +396,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
-if (context.GetComputeCapability() < 53) {
+if (context->GetComputeCapability() < 53) {
return;
}
......@@ -449,7 +426,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
phi::dtype::float16* c =
input3_gpu.mutable_data<phi::dtype::float16>(gpu_place);
-GetBlas<phi::dtype::float16>(context).GEMM(
+GetBlas<phi::dtype::float16>(*context).GEMM(
false,
true,
m,
......@@ -466,7 +443,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-context.Wait();
+context->Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
......@@ -485,11 +462,8 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
-phi::GPUContext context(gpu_place);
-context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-.GetAllocator(gpu_place, context.stream())
-.get());
-context.PartialInitWithAllocator();
+phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......@@ -512,14 +486,14 @@ void GemvTest(int m, int n, bool trans) {
paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
-GetBlas<T>(context).GEMV(trans,
-static_cast<int>(m),
-static_cast<int>(n),
-1.,
-g_data_a,
-g_data_b,
-0.,
-g_data_c);
+GetBlas<T>(*context).GEMV(trans,
+static_cast<int>(m),
+static_cast<int>(n),
+1.,
+g_data_a,
+g_data_b,
+0.,
+g_data_c);
paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
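Note: GemvTest follows the same pattern; apart from the pool lookup, the only change is GetBlas<T>(*context) and the re-indented argument list. A sketch of the call with the pooled context (GemvSketch is an illustrative name; the data pointers are the GPU buffers prepared by the test):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"

template <typename T>
void GemvSketch(bool trans, int m, int n,
                const T* g_data_a, const T* g_data_b, T* g_data_c) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* context = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  // y = A * x (or A^T * x when trans is true), with alpha = 1 and beta = 0.
  phi::funcs::GetBlas<phi::GPUContext, T>(*context).GEMV(
      trans, m, n, static_cast<T>(1.0), g_data_a, g_data_b,
      static_cast<T>(0.0), g_data_c);
  context->Wait();
}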
......