未验证 提交 7f92e27e 编写于 作者: Z zyfncg 提交者: GitHub

Fix bug of zero_allocator in HostAlloc (#48108)

* fix bug of zero_allocator in host

* fix test compile bug

* add unittest

* update test
上级 058aa381
...@@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() { ...@@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() {
memory::allocation::AllocatorFacade::Instance() memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_) .GetZeroAllocator(place_)
.get()); .get());
gpu_context->SetHostZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
gpu_context->SetGenerator( gpu_context->SetGenerator(
framework::DefaultCUDAGenerator(place_.GetDeviceId()).get()); framework::DefaultCUDAGenerator(place_.GetDeviceId()).get());
gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get()); gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());
......
...@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test {
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0)) .GetZeroAllocator(platform::CUDAPlace(0))
.get()); .get());
ctx_->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetPinnedAllocator( ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
......
...@@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) { ...@@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place) .GetZeroAllocator(place)
.get()); .get());
ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetPinnedAllocator( ctx->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
......
...@@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( ...@@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id)) .GetZeroAllocator(CUDAPlace(dev_id))
.get()); .get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetPinnedAllocator( dev_ctx->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
...@@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm( ...@@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(XPUPlace(dev_id)) .GetZeroAllocator(XPUPlace(dev_id))
.get()); .get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
BKCLCommImpl* c = new BKCLCommImpl; BKCLCommImpl* c = new BKCLCommImpl;
c->set_ring_id(ring_id); c->set_ring_id(ring_id);
......
...@@ -136,6 +136,10 @@ struct NCCLContext { ...@@ -136,6 +136,10 @@ struct NCCLContext {
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id)) .GetZeroAllocator(CUDAPlace(dev_id))
.get()); .get());
ctx_->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetPinnedAllocator( ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
......
...@@ -183,6 +183,9 @@ std::unique_ptr<DeviceContext> CreateDeviceContext( ...@@ -183,6 +183,9 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance() dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p) .GetZeroAllocator(p)
.get()); .get());
dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
return PtrType(dev_ctx); return PtrType(dev_ctx);
} }
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
...@@ -38,6 +39,10 @@ TEST(Device, Init) { ...@@ -38,6 +39,10 @@ TEST(Device, Init) {
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i)) .GetZeroAllocator(CUDAPlace(i))
.get()); .get());
device_context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetPinnedAllocator( device_context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
...@@ -69,6 +74,10 @@ TEST(Device, GPUContext) { ...@@ -69,6 +74,10 @@ TEST(Device, GPUContext) {
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i)) .GetZeroAllocator(CUDAPlace(i))
.get()); .get());
device_context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetPinnedAllocator( device_context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
...@@ -92,6 +101,45 @@ TEST(Device, GPUContext) { ...@@ -92,6 +101,45 @@ TEST(Device, GPUContext) {
} }
} }
// Checks that a GPUContext configured with a host zero-size allocator can
// HostAlloc a zero-element tensor: the result must live on CPU, keep
// numel() == 0, and carry the requested dtype. Also checks that the host
// zero allocator can be shared with a second context via
// GetHostZeroAllocator()/SetHostZeroAllocator().
TEST(Device, HostZeroAllocator) {
  using paddle::platform::CUDAPlace;
  auto device_context = std::make_unique<phi::GPUContext>(CUDAPlace(0));
  // Wire up every allocator the context needs before
  // PartialInitWithAllocator() is called.
  device_context->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(CUDAPlace(0), device_context->stream())
          .get());
  device_context->SetHostAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CPUPlace())
          .get());
  device_context->SetZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(CUDAPlace(0))
          .get());
  // The host zero allocator must come from CPUPlace, not the CUDA place —
  // this is the fix under test (#48108).
  device_context->SetHostZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(paddle::platform::CPUPlace())
          .get());
  device_context->SetPinnedAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CUDAPinnedPlace())
          .get());
  device_context->PartialInitWithAllocator();
  phi::DenseTensor tensor;
  tensor.Resize({0});
  device_context->HostAlloc<float>(&tensor);
  // A zero-sized host allocation must still record CPU place and the
  // requested dtype.
  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
  ASSERT_EQ(tensor.numel(), 0);
  ASSERT_EQ(tensor.dtype(), phi::DataType::FLOAT32);
  // A second context can borrow the first context's host zero allocator.
  phi::GPUContext gpu_context(CUDAPlace(0));
  gpu_context.SetHostZeroAllocator(&device_context->GetHostZeroAllocator());
  gpu_context.HostAlloc<float>(&tensor);
  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
}
TEST(Device, DeviceContextPool) { TEST(Device, DeviceContextPool) {
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
......
...@@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place) .GetZeroAllocator(place)
.get()); .get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
return context; return context;
}) })
.def_static( .def_static(
...@@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place) .GetZeroAllocator(place)
.get()); .get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
return context; return context;
#endif #endif
}) })
...@@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place) .GetZeroAllocator(place)
.get()); .get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
context->SetPinnedAllocator( context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance() paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace()) .GetAllocator(paddle::platform::CUDAPinnedPlace())
......
...@@ -54,6 +54,14 @@ struct DeviceContext::Impl { ...@@ -54,6 +54,14 @@ struct DeviceContext::Impl {
zero_allocator_ = allocator; zero_allocator_ = allocator;
} }
// Registers the allocator used for zero-size *host* allocations
// (HostAlloc on a tensor with numel() == 0). The pointer is borrowed:
// the caller retains ownership and must keep the allocator alive for
// the lifetime of this context.
void SetHostZeroAllocator(const Allocator* allocator) {
  PADDLE_ENFORCE_NOT_NULL(
      allocator,
      phi::errors::InvalidArgument(
          "Required allocator shall not be nullptr, but received nullptr."));
  host_zero_allocator_ = allocator;
}
void SetPinnedAllocator(const Allocator* allocator) { void SetPinnedAllocator(const Allocator* allocator) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
allocator, allocator,
...@@ -106,6 +114,14 @@ struct DeviceContext::Impl { ...@@ -106,6 +114,14 @@ struct DeviceContext::Impl {
return *zero_allocator_; return *zero_allocator_;
} }
// Returns the zero-size host allocator. Fails with an explicit error if
// SetHostZeroAllocator() was never called on this context.
const Allocator& GetHostZeroAllocator() const {
  PADDLE_ENFORCE_NOT_NULL(
      host_zero_allocator_,
      // Name the correct member in the message: the original said
      // "zero_allocator_" (copy-paste from GetZeroAllocator), which
      // misleads anyone debugging a missing *host* zero allocator.
      phi::errors::InvalidArgument("Required host_zero_allocator_ shall not "
                                   "be nullptr, but received nullptr."));
  return *host_zero_allocator_;
}
const Allocator& GetPinnedAllocator() const { const Allocator& GetPinnedAllocator() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
pinned_allocator_, pinned_allocator_,
...@@ -172,7 +188,8 @@ struct DeviceContext::Impl { ...@@ -172,7 +188,8 @@ struct DeviceContext::Impl {
if (tensor->initialized() && tensor->place() != CPUPlace()) { if (tensor->initialized() && tensor->place() != CPUPlace()) {
ClearHolder(tensor); ClearHolder(tensor);
} }
auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; auto* allocator =
tensor->numel() == 0 ? host_zero_allocator_ : host_allocator_;
return tensor->AllocateFrom( return tensor->AllocateFrom(
const_cast<Allocator*>(allocator), dtype, requested_size); const_cast<Allocator*>(allocator), dtype, requested_size);
} }
...@@ -234,6 +251,7 @@ struct DeviceContext::Impl { ...@@ -234,6 +251,7 @@ struct DeviceContext::Impl {
const Allocator* device_allocator_{nullptr}; const Allocator* device_allocator_{nullptr};
const Allocator* host_allocator_{nullptr}; const Allocator* host_allocator_{nullptr};
const Allocator* zero_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr};
const Allocator* host_zero_allocator_{nullptr};
const Allocator* pinned_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
const Allocator* cuda_graph_allocator_{nullptr}; const Allocator* cuda_graph_allocator_{nullptr};
...@@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) { ...@@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) {
impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetHostAllocator(&other.GetHostAllocator());
impl_->SetAllocator(&other.GetAllocator()); impl_->SetAllocator(&other.GetAllocator());
impl_->SetZeroAllocator(&other.GetZeroAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator());
impl_->SetHostZeroAllocator(&other.GetHostZeroAllocator());
impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetPinnedAllocator(&other.GetPinnedAllocator());
impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetHostGenerator(other.GetHostGenerator());
impl_->SetGenerator(other.GetGenerator()); impl_->SetGenerator(other.GetGenerator());
...@@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) { ...@@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) {
impl_->SetZeroAllocator(allocator); impl_->SetZeroAllocator(allocator);
} }
// Public forwarder: registers the zero-size host allocator on the Impl.
// Null pointers are rejected inside Impl::SetHostZeroAllocator.
void DeviceContext::SetHostZeroAllocator(const Allocator* allocator) {
  impl_->SetHostZeroAllocator(allocator);
}
const Allocator& DeviceContext::GetZeroAllocator() const { const Allocator& DeviceContext::GetZeroAllocator() const {
return impl_->GetZeroAllocator(); return impl_->GetZeroAllocator();
} }
// Public forwarder: returns the zero-size host allocator from the Impl.
// Fails inside Impl if the allocator was never set.
const Allocator& DeviceContext::GetHostZeroAllocator() const {
  return impl_->GetHostZeroAllocator();
}
void DeviceContext::SetPinnedAllocator(const Allocator* allocator) { void DeviceContext::SetPinnedAllocator(const Allocator* allocator) {
impl_->SetPinnedAllocator(allocator); impl_->SetPinnedAllocator(allocator);
} }
......
...@@ -82,6 +82,13 @@ class PADDLE_API DeviceContext { ...@@ -82,6 +82,13 @@ class PADDLE_API DeviceContext {
*/ */
void SetZeroAllocator(const Allocator*); void SetZeroAllocator(const Allocator*);
/**
* @brief Set the zero-size host Allocator object.
*
* @param allocator
*/
void SetHostZeroAllocator(const Allocator*);
/** /**
* @brief Set the zero-size Allocator object. * @brief Set the zero-size Allocator object.
* *
...@@ -105,6 +112,8 @@ class PADDLE_API DeviceContext { ...@@ -105,6 +112,8 @@ class PADDLE_API DeviceContext {
const Allocator& GetZeroAllocator() const; const Allocator& GetZeroAllocator() const;
const Allocator& GetHostZeroAllocator() const;
const Allocator& GetPinnedAllocator() const; const Allocator& GetPinnedAllocator() const;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册