From 168fac13449e1e8dd982b80c61cfb9ad9261f55d Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Thu, 8 Jun 2023 14:45:40 +0800 Subject: [PATCH] xpu support auto growth allocator (#54121) --- paddle/fluid/memory/allocation/CMakeLists.txt | 1 + .../memory/allocation/allocator_facade.cc | 239 +++++++++++++++++- .../memory/allocation/allocator_facade.h | 3 + .../allocation/stream_safe_xpu_allocator.cc | 209 +++++++++++++++ .../allocation/stream_safe_xpu_allocator.h | 90 +++++++ .../fluid/memory/allocation/xpu_allocator.cc | 57 +++++ .../fluid/memory/allocation/xpu_allocator.h | 42 +++ paddle/fluid/platform/device/xpu/xpu_info.cc | 124 +++++++++ paddle/fluid/platform/device/xpu/xpu_info.h | 16 ++ paddle/phi/backends/device_memory_aligment.h | 3 +- paddle/phi/backends/xpu/enforce_xpu.h | 10 + paddle/phi/backends/xpu/xpu_context.cc | 115 +++------ paddle/phi/backends/xpu/xpu_context.h | 4 - paddle/phi/backends/xpu/xpu_header.h | 18 ++ paddle/phi/backends/xpu/xpu_info.h | 5 + paddle/phi/core/flags.cc | 2 +- paddle/phi/kernels/xpu/top_k_kernel.cc | 23 +- paddle/phi/kernels/xpu/xpu_mem_util.h | 56 ++++ 18 files changed, 923 insertions(+), 94 deletions(-) create mode 100644 paddle/fluid/memory/allocation/stream_safe_xpu_allocator.cc create mode 100644 paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h create mode 100644 paddle/fluid/memory/allocation/xpu_allocator.cc create mode 100644 paddle/fluid/memory/allocation/xpu_allocator.h create mode 100644 paddle/phi/kernels/xpu/xpu_mem_util.h diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index bb1d9e2e897..1a395903989 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -56,6 +56,7 @@ endif() if(WITH_XPU) list(APPEND ALLOCATOR_DEPS xpu_info) + list(APPEND ALLOCATOR_SRCS xpu_allocator.cc stream_safe_xpu_allocator.cc) endif() if(WITH_IPU) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 251ec771728..07e55115ba1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/memory/allocation/stat_allocator.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/macros.h" @@ -35,7 +36,6 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #ifdef PADDLE_WITH_CUDA @@ -50,7 +50,10 @@ #endif #ifdef PADDLE_WITH_XPU +#include "paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h" +#include "paddle/fluid/memory/allocation/xpu_allocator.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/phi/backends/xpu/xpu_context.h" #endif #ifdef PADDLE_WITH_IPU @@ -166,6 +169,11 @@ class AllocatorFacadePrivate { std::map>>; #endif +#ifdef PADDLE_WITH_XPU + using XPUAllocatorMap = + std::map>>; +#endif explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); @@ -236,9 +244,16 @@ class 
AllocatorFacadePrivate { InitNaiveBestFitCUDAPinnedAllocator(); #endif #ifdef PADDLE_WITH_XPU + allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + InitAutoGrowthXPUAllocator(platform::XPUPlace(dev_id), + allow_free_idle_chunk_); + } + if (FLAGS_use_stream_safe_cuda_allocator) { + WrapStreamSafeXPUAllocatorForDefault(); + is_stream_safe_cuda_allocator_used_ = true; } + #endif #ifdef PADDLE_WITH_IPU for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { @@ -441,6 +456,114 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_XPU + bool HasXPUAllocator(const platform::XPUPlace& place, XPUStream stream) { + auto it = xpu_allocators_.find(place); + if (it == xpu_allocators_.end()) { + return false; + } + const std::map>& allocator_map = + it->second; + return allocator_map.find(stream) != allocator_map.end(); + } + + const std::shared_ptr& GetAllocator( + const platform::XPUPlace& place, + XPUStream stream, + bool create_if_not_found = false) { + if (stream == GetDefaultStream(place)) { + VLOG(7) << "Get Allocator by passing in a default stream"; + return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + } + + /* shared_lock_guard */ { + std::shared_lock lock_guard( + xpu_allocator_mutex_); + if (LIKELY(HasXPUAllocator(place, stream))) { + return xpu_allocators_[place][stream]; + } else { + PADDLE_ENFORCE_NE(create_if_not_found, + false, + platform::errors::NotFound( + "No allocator found for stream %s in place %s " + "with create_if_not_found = false", + stream, + place)); + } + } + + /* unique_lock_guard */ { + std::unique_lock lock_guard( + xpu_allocator_mutex_); + InitStreamSafeXPUAllocator(place, stream); + return xpu_allocators_[place][stream]; + } + } + + const std::shared_ptr + GetDefaultStreamSafeXPUAllocator(const platform::XPUPlace& place) const { + const auto iter = default_stream_safe_xpu_allocators_.find(place); + PADDLE_ENFORCE_NE( + iter, + default_stream_safe_xpu_allocators_.end(), + platform::errors::NotFound( + "No StreamSafeXPUAllocator found for the place, %s", place)); + return iter->second; + } + + XPUStream GetDefaultStream(const platform::XPUPlace& place) const { + const std::shared_ptr& allocator = + GetDefaultStreamSafeXPUAllocator(place); + return allocator->GetDefaultStream(); + } + + void SetDefaultStream(const platform::XPUPlace& place, XPUStream stream) { + const std::shared_ptr& allocator = + GetDefaultStreamSafeXPUAllocator(place); + + PADDLE_ENFORCE_EQ( + allocator->GetDefaultStream(), + nullptr, + platform::errors::Unavailable( + "The default stream for StreamSafeXPUAllocator(%p) in %s has been " + "set to %p, not allow to change it to %p.", + allocator.get(), + place, + allocator->GetDefaultStream(), + stream)); + + allocator->SetDefaultStream(stream); + VLOG(8) << "Set default stream to " << stream + << " for StreamSafeXPUAllocator(" << allocator.get() << ") in " + << place; + } + + void RecordStream(std::shared_ptr allocation, + XPUStream stream) { + std::shared_ptr stream_safe_xpu_allocation = + std::dynamic_pointer_cast(allocation); + if (stream_safe_xpu_allocation != nullptr) { + stream_safe_xpu_allocation->RecordStream(stream); + } else { + VLOG(6) << "RecordStream for a non-StreamSafeXPUAllocation"; + } + } + + XPUStream GetStream( + const std::shared_ptr& allocation) const { + const std::shared_ptr stream_safe_xpu_allocation = + std::dynamic_pointer_cast(allocation); + if 
(stream_safe_xpu_allocation != nullptr) { + return stream_safe_xpu_allocation->GetOwningStream(); + } + + VLOG(6) << "GetStream for a non-StreamSafeXPUAllocation"; + return static_cast( + platform::DeviceContextPool::Instance().Get(allocation->place())) + ->stream(); + } +#endif + private: class ZeroSizeAllocator : public Allocator { public: @@ -774,6 +897,104 @@ class AllocatorFacadePrivate { void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) { allocators_[p] = std::make_shared(p); } + + // Create a new XPUAllocator or XPUManagedAllocator for the given device + std::shared_ptr CreateXPUAllocator(platform::XPUPlace p) { + return std::make_shared(p); + } + + void InitStreamSafeXPUAllocator(platform::XPUPlace p, XPUStream stream) { + PADDLE_ENFORCE_EQ( + strategy_, + AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Only support auto-growth strategey for StreamSafeXPUAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + if (LIKELY(!HasXPUAllocator(p, stream))) { + VLOG(8) << "Init XPU allocator for stream " << stream << " in place " + << p; + InitAutoGrowthXPUAllocator(p, stream); + + WrapStreamSafeXPUAllocator(p, stream); + + WrapXPURetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); + WrapStatAllocator(p, stream); + } + } + + void InitAutoGrowthXPUAllocator(platform::XPUPlace p, XPUStream stream) { + auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 6; + VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " + << FLAGS_auto_growth_chunk_size_in_mb; + auto xpu_allocator = CreateXPUAllocator(p); + auto alignment = platform::XPUMinChunkSize(); + + std::shared_ptr underlying_allocator{nullptr}; + + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = xpu_allocator; + + xpu_allocators_[p][stream] = std::make_shared( + underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_); + } + + void InitAutoGrowthXPUAllocator(platform::XPUPlace p, + bool allow_free_idle_chunk) { + auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 6; + VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " + << FLAGS_auto_growth_chunk_size_in_mb; + auto xpu_allocator = CreateXPUAllocator(p); + auto alignment = platform::XPUMinChunkSize(); + + std::shared_ptr underlying_allocator{nullptr}; + + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = xpu_allocator; + + allocators_[p] = std::make_shared( + underlying_allocator, alignment, chunk_size, allow_free_idle_chunk); + } + + void WrapStreamSafeXPUAllocator(platform::XPUPlace p, XPUStream stream) { + std::shared_ptr& allocator = xpu_allocators_[p][stream]; + allocator = std::make_shared(allocator, p, stream); + } + + void WrapStreamSafeXPUAllocatorForDefault() { + for (auto& pair : allocators_) { + auto& place = pair.first; + if (platform::is_xpu_place(place)) { + std::shared_ptr&& allocator = + std::make_shared( + pair.second, + place, + /* default_stream = */ nullptr); + pair.second = allocator; + default_stream_safe_xpu_allocators_[place] = allocator; + VLOG(8) << "WrapStreamSafeXPUAllocator for " << place + << ", allocator address = " << pair.second.get(); + } + } + } + + void WrapXPURetryAllocator(platform::XPUPlace p, + XPUStream stream, + size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, + 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + std::shared_ptr& allocator = xpu_allocators_[p][stream]; + allocator = 
std::make_shared(allocator, retry_time); + } + + void WrapStatAllocator(platform::XPUPlace p, XPUStream stream) { + std::shared_ptr& allocator = xpu_allocators_[p][stream]; + allocator = std::make_shared(allocator); + } + #endif #ifdef PADDLE_WITH_IPU @@ -807,7 +1028,7 @@ class AllocatorFacadePrivate { int device_count = platform::GetXPUDeviceCount(); for (int i = 0; i < device_count; ++i) { platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); + system_allocators_[p] = CreateXPUAllocator(p); } #endif #ifdef PADDLE_WITH_IPU @@ -905,7 +1126,8 @@ class AllocatorFacadePrivate { platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); for (auto& pair : allocators_) { - if (platform::is_gpu_place(pair.first)) { + if (platform::is_gpu_place(pair.first) || + platform::is_xpu_place(pair.first)) { pair.second = std::make_shared(pair.second, retry_time); } } @@ -930,6 +1152,15 @@ class AllocatorFacadePrivate { CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; #endif + +#ifdef PADDLE_WITH_XPU + // a standalone XPU allocator to support multi-stream GC in new executor + std::map> + default_stream_safe_xpu_allocators_; + XPUAllocatorMap xpu_allocators_; + std::shared_timed_mutex xpu_allocator_mutex_; +#endif + AllocatorStrategy strategy_; AllocatorMap allocators_; static AllocatorMap zero_size_allocators_; diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 7f10b2286b4..a1f21a5e693 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,6 +19,9 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/stream.h" diff --git a/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.cc new file mode 100644 index 00000000000..7f48ef5ab50 --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h" +#include + +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/xpu/xpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +StreamSafeXPUAllocation::StreamSafeXPUAllocation( + DecoratedAllocationPtr underlying_allocation, + XPUStream owning_stream, + StreamSafeXPUAllocator* allocator) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->base_ptr(), + underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} + +void StreamSafeXPUAllocation::RecordStream(XPUStream stream) { + VLOG(8) << "Try record stream " << stream << " for address " << ptr(); + if (stream == owning_stream_) { + return; + } + + std::call_once(once_flag_, + [this] { phi::backends::xpu::SetXPUDeviceId(place_.device); }); + + std::lock_guard lock_guard(outstanding_event_map_lock_); + + RecordStreamPrivate(stream); +} + +bool StreamSafeXPUAllocation::CanBeFreed() { + std::call_once(once_flag_, + [this] { phi::backends::xpu::SetXPUDeviceId(place_.device); }); + for (auto it = outstanding_event_map_.begin(); + it != outstanding_event_map_.end(); + ++it) { + XPUEvent& event = it->second; + + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_destroy(event)); + VLOG(8) << "Destroy event " << event; + } + return true; +} + +XPUStream StreamSafeXPUAllocation::GetOwningStream() const { + return owning_stream_; +} + +void StreamSafeXPUAllocation::RecordStreamPrivate(XPUStream stream) { + XPUEvent record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + XPUEvent new_event; + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_create(&new_event)); + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_record(record_event, stream)); + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + +StreamSafeXPUAllocator::StreamSafeXPUAllocator( + std::shared_ptr underlying_allocator, + platform::XPUPlace place, + XPUStream default_stream) + : underlying_allocator_(std::move(underlying_allocator)), + place_(std::move(place)), + default_stream_(std::move(default_stream)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); +} + +StreamSafeXPUAllocator::~StreamSafeXPUAllocator() { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); +} + +bool StreamSafeXPUAllocator::IsAllocThreadSafe() const { return true; } + +XPUStream StreamSafeXPUAllocator::GetDefaultStream() const { + return default_stream_; +} + +void StreamSafeXPUAllocator::SetDefaultStream(XPUStream stream) { + default_stream_ = stream; +} + +phi::Allocation* StreamSafeXPUAllocator::AllocateImpl(size_t size) { + platform::RecordEvent record("StreamSafeXPUAllocator::Allocate", + platform::TracerEventType::UserDefined, + 9 /*level*/); + ProcessUnfreedAllocations(); + VLOG(8) << "Try allocate " << size << " bytes"; + AllocationPtr underlying_allocation; + try { + underlying_allocation = 
underlying_allocator_->Allocate(size); + } catch (BadAlloc&) { + VLOG(4) << "Allocation failed when allocating " << size << " bytes"; + ReleaseImpl(place_); + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (...) { + VLOG(3) + << "Still allocation failed after release memory from all streams"; + throw; + } + } catch (...) { + throw; + } + StreamSafeXPUAllocation* allocation = new StreamSafeXPUAllocation( + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_, + this); + VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate " + << allocation->size() << " bytes at address " << allocation->ptr() + << " , stream: " << default_stream_; + return allocation; +} + +void StreamSafeXPUAllocator::FreeImpl(phi::Allocation* allocation) { + platform::RecordEvent record("StreamSafeXPUAllocator::Free", + platform::TracerEventType::UserDefined, + 9 /*level*/); + StreamSafeXPUAllocation* stream_safe_xpu_allocation = + static_cast(allocation); + + VLOG(8) << "Try free allocation " << stream_safe_xpu_allocation->ptr(); + if (stream_safe_xpu_allocation->CanBeFreed()) { + VLOG(9) << "Directly delete allocation"; + delete stream_safe_xpu_allocation; + } else { + VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); + unfreed_allocations_.emplace_back(stream_safe_xpu_allocation); + } +} + +uint64_t StreamSafeXPUAllocator::ReleaseImpl(const platform::Place& place) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place]; + uint64_t released_size = 0; + for (StreamSafeXPUAllocator* allocator : allocators) { + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); + } + VLOG(8) << "Release " << released_size << " bytes memory from all streams"; + return released_size; +} + +void StreamSafeXPUAllocator::ProcessUnfreedAllocations() { + // NOTE(Ruibiao): This condition is to reduce lock competion. It does not need + // to be thread-safe since here occasional misjudgments are permissible. + if (unfreed_allocations_.empty()) { + return; + } + + std::lock_guard lock_guard(unfreed_allocation_lock_); + for (auto it = unfreed_allocations_.begin(); + it != unfreed_allocations_.end();) { + if ((*it)->CanBeFreed()) { + delete *it; + it = unfreed_allocations_.erase(it); + } else { + ++it; + } + } +} + +uint64_t StreamSafeXPUAllocator::ProcessUnfreedAllocationsAndRelease() { + ProcessUnfreedAllocations(); + return underlying_allocator_->Release(place_); +} + +thread_local std::once_flag StreamSafeXPUAllocation::once_flag_; + +std::map> + StreamSafeXPUAllocator::allocator_map_; +SpinLock StreamSafeXPUAllocator::allocator_map_lock_; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h b/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h new file mode 100644 index 00000000000..0d30d8a44ed --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_xpu_allocator.h @@ -0,0 +1,90 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/phi/backends/xpu/xpu_context.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StreamSafeXPUAllocator; + +class StreamSafeXPUAllocation : public Allocation { + public: + StreamSafeXPUAllocation(DecoratedAllocationPtr underlying_allocation, + XPUStream owning_stream, + StreamSafeXPUAllocator *allocator); + + void RecordStream(XPUStream stream); + bool CanBeFreed(); + XPUStream GetOwningStream() const; + + private: + thread_local static std::once_flag once_flag_; + void RecordStreamPrivate(XPUStream stream); + DecoratedAllocationPtr underlying_allocation_; + + std::map outstanding_event_map_; + XPUStream owning_stream_; + SpinLock outstanding_event_map_lock_; + std::shared_ptr allocator_; +}; + +class StreamSafeXPUAllocator + : public Allocator, + public std::enable_shared_from_this { + public: + StreamSafeXPUAllocator(std::shared_ptr underlying_allocator, + platform::XPUPlace place, + XPUStream default_stream); + ~StreamSafeXPUAllocator(); + + bool IsAllocThreadSafe() const override; + XPUStream GetDefaultStream() const; + void SetDefaultStream(XPUStream stream); + + protected: + phi::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(phi::Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + void ProcessUnfreedAllocations(); + uint64_t ProcessUnfreedAllocationsAndRelease(); + + static std::map> + allocator_map_; + static SpinLock allocator_map_lock_; + + std::shared_ptr underlying_allocator_; + platform::XPUPlace place_; + XPUStream default_stream_; + std::list unfreed_allocations_; + SpinLock unfreed_allocation_lock_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/xpu_allocator.cc b/paddle/fluid/memory/allocation/xpu_allocator.cc new file mode 100644 index 00000000000..6867aed2577 --- /dev/null +++ b/paddle/fluid/memory/allocation/xpu_allocator.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/allocation/xpu_allocator.h" + +#include + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { +bool XPUAllocator::IsAllocThreadSafe() const { return true; } +void XPUAllocator::FreeImpl(phi::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + allocation->place(), + place_, + platform::errors::PermissionDenied( + "XPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedXPUFree( + allocation->ptr(), allocation->size(), place_.device); + delete allocation; +} + +phi::Allocation* XPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetXPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedXPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == XPU_SUCCESS)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU %d. " + "Cannot allocate %s memory on XPU %d.\n\n", + place_.device, + string::HumanReadableSize(size), + place_.device)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/xpu_allocator.h b/paddle/fluid/memory/allocation/xpu_allocator.h new file mode 100644 index 00000000000..ee1e3317d66 --- /dev/null +++ b/paddle/fluid/memory/allocation/xpu_allocator.h @@ -0,0 +1,42 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class XPUAllocator : public Allocator { + public: + explicit XPUAllocator(const platform::XPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(phi::Allocation* allocation) override; + phi::Allocation* AllocateImpl(size_t size) override; + + private: + platform::XPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 624a19dd7de..a15c5d3c499 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" @@ -92,5 +94,127 @@ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { return phi::backends::xpu::get_xpu_version(dev_id); } +/**************************** XPU Allocator **************************/ +size_t XPUMinChunkSize() { return 1 << 6; } + +static void RaiseNonOutOfMemoryError(int status) { + if (status == XPUERR_NOMEM) { + status = XPU_SUCCESS; + } + PADDLE_ENFORCE_XRE_SUCCESS(status); +} + +class RecordedXPUMallocHelper { + private: + explicit RecordedXPUMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedXPUMallocHelper); + + public: + static RecordedXPUMallocHelper* Instance(int dev_id) { + std::call_once(once_flag_, [] { + int dev_cnt = GetXPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + // NOTE(zhiqiu): share the flags with gpu, avoid more flags. + instances_.emplace_back(new RecordedXPUMallocHelper(i, 0UL << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, + 0, + phi::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, + instances_.size(), + phi::errors::OutOfRange("Device id %d exceeds XPU card number %d.", + dev_id, + instances_.size())); + return instances_[dev_id].get(); + } + + /** + * Try to allocate `size` XPU memory. Only XPUERR_NOMEM + * or XPU_SUCCESS would be returned. + */ + int Malloc(void** ptr, size_t size) { + LockGuardPtr lock(mtx_); + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return XPUERR_NOMEM; + } + + XPUDeviceGuard guard(dev_id_); + VLOG(10) << "Allocate " << size << " bytes with ptr = " << &(ptr); + auto result = xpu_malloc(ptr, size); + if (result == XPU_SUCCESS) { + cur_size_.fetch_add(size); + return result; + } else { + RaiseNonOutOfMemoryError(result); + // Non out of memory error would be raised inside + // RaiseNonOutOfMemoryError. Therefore, we can + // return XPUERR_NOMEM directly here. + return XPUERR_NOMEM; + } + } + + /** + * Free XPU memory. Usually, free is not allowed to raise error. + * If it does raise error, the process should be crashed. 
+ */ + void Free(void* ptr, size_t size) { + XPUDeviceGuard guard(dev_id_); + xpu_free(ptr); + cur_size_.fetch_sub(size); + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + + static std::once_flag once_flag_; + static std::vector> instances_; +}; + +std::once_flag RecordedXPUMallocHelper::once_flag_; +std::vector> + RecordedXPUMallocHelper::instances_; + +int RecordedXPUMalloc(void** ptr, size_t size, int dev_id) { + return RecordedXPUMallocHelper::Instance(dev_id)->Malloc(ptr, size); +} + +void RecordedXPUFree(void* p, size_t size, int dev_id) { + return RecordedXPUMallocHelper::Instance(dev_id)->Free(p, size); +} + +uint64_t RecordedXPUMallocSize(int dev_id) { + return RecordedXPUMallocHelper::Instance(dev_id)->RecordedSize(); +} + +uint64_t RecordedXPULimitSize(int dev_id) { + return RecordedXPUMallocHelper::Instance(dev_id)->LimitSize(); +} + +bool IsXPUMallocRecorded(int dev_id) { + return RecordedXPUMallocHelper::Instance(dev_id)->NeedRecord(); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 3d428ec7c5b..163c3a3dcc8 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -70,6 +70,22 @@ using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); +//! Get the minimum chunk size for XPU allocator. +size_t XPUMinChunkSize(); + +//! xpu_malloc with recorded info +int RecordedXPUMalloc(void **ptr, size_t size, int dev_id); + +//! xpu_free with recorded info +void RecordedXPUFree(void *p, size_t size, int dev_id); + +//! Get recorded actrtMalloc size. If record is disabled, return 0. +uint64_t RecordedXPUMallocSize(int dev_id); + +uint64_t RecordedXPULimitSize(int dev_id); + +bool IsXPUMallocRecorded(int dev_id); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 06d9b450a83..8508d520655 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/errors.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/xpu/xpu_info.h" namespace phi { @@ -38,7 +39,7 @@ inline size_t Alignment(size_t size, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) - alignment = alignment; + alignment = phi::backends::xpu::XPUMinChunkSize(); #else PADDLE_THROW(phi::errors::PreconditionNotMet( "Fluid is not compiled with CUDA/XPU.")); diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 853001b1897..0a2a21e236d 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -206,6 +206,16 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); __THROW_ERROR_INTERNAL__(__summary__); \ } \ } while (0) +#define PADDLE_ENFORCE_XRE_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + auto xre_msg = xpu_strerror(__cond__); \ + if (UNLIKELY(__cond__ != XPU_SUCCESS)) { \ + auto __summary__ = \ + phi::errors::External("XPU Runtime Error: ", xre_msg); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) } // namespace xpu } // namespace backends diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 44f247ff259..34f640e66a6 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -59,22 +59,24 @@ struct XPUContext::Impl { } } - bool IsDataloader() const { - if (std::getenv("XPU_PADDLE_XDL_CONTEXTS") == nullptr) { - return false; - } - std::string cur_thread_name = phi::GetCurrentThreadName(); - VLOG(3) << "XPU Dataloader: current thread at Get Context = " - << phi::GetCurrentThreadName(); - bool is_dataloader_thread = (cur_thread_name != "MainThread"); - return is_dataloader_thread; - } - Impl() : place_(XPUPlace()) {} explicit Impl(const Place& place) : place_(place) {} ~Impl() { + for (auto& ctx_it : context_map_) { + auto& ctx = ctx_it.second; + if (ctx != nullptr) { + xpu_wait(ctx->xpu_stream); + if (ctx->xpu_stream) { + xpu_stream_destroy(ctx->xpu_stream); + ctx->xpu_stream = nullptr; + } + ctx = nullptr; + } + } + context_map_.clear(); + if (owned_ && context_ != nullptr) { backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); xpu_wait(context_->xpu_stream); @@ -87,27 +89,13 @@ struct XPUContext::Impl { xpu::destroy_context(context_); context_ = nullptr; } - if (std::getenv("XPU_PADDLE_XDL_CONTEXTS") != nullptr) { - // destroy all XPU Dataloader threads if exist - backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - for (auto ctx : GetAllXdlCtxs()) { - xpu_wait(ctx->xpu_stream); - if (ctx->xpu_stream) { - xpu_stream_destroy(ctx->xpu_stream); - ctx->xpu_stream = nullptr; - } - xpu::destroy_context(ctx); - ctx = nullptr; - } - xdl_context_map_.clear(); - } } const Place& GetPlace() const { return place_; } XPUStream stream() const { - if (IsDataloader()) { - xpu::Context* ctx_t = GetXdlCtx(); + xpu::Context* ctx_t = GetXdlCtx(); + if (ctx_t) { return ctx_t->xpu_stream; } return context_->xpu_stream; @@ -132,10 +120,10 @@ struct XPUContext::Impl { // Overload GetXContext function to set and get // contexts of XPU Dataloader threads, and keep old GetXContext Method xpu::Context* GetXContext() { - if (IsDataloader()) { - SetXdlCtx(); - xpu::Context* ctx_t = GetXdlCtx(); - PD_CHECK(ctx_t != nullptr, "the xpu dataloader context is nullptr."); + SetXdlCtx(); + xpu::Context* ctx_t = GetXdlCtx(); + if (ctx_t) { + PD_CHECK(ctx_t != nullptr, 
"the xpu context is nullptr."); return ctx_t; } @@ -144,21 +132,12 @@ struct XPUContext::Impl { } void Wait() const { - if (IsDataloader()) { - xpu::Context* ctx_t = GetXdlCtx(); - if (ctx_t) { - PD_CHECK(ctx_t != nullptr, "the xpu dataloader context is nullptr."); - xpu_wait(ctx_t->xpu_stream); - } - return; - } - backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); xpu_wait(context_->xpu_stream); xpu::Context* ctx_t = GetXdlCtx(); if (ctx_t) { - PD_CHECK(ctx_t != nullptr, "the xpu dataloader context is nullptr."); + PD_CHECK(ctx_t != nullptr, "the xpu context is nullptr."); xpu_wait(ctx_t->xpu_stream); } } @@ -169,10 +148,6 @@ struct XPUContext::Impl { LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << static_cast(place_.device); context_ = xpu::create_context(); - if (std::getenv("XPU_PADDLE_XDL_CONTEXTS") != nullptr) { - // Initialize XPU Dataloader threads contexts map - InitializeXdlContexts(); - } xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(); } @@ -190,43 +165,29 @@ struct XPUContext::Impl { stream_owned_ = true; } - // Methods of XPU Dataloader threads contexts map, - // currently, need set 'export XPU_PADDLE_XDL_CONTEXTS=1' - // to open XPU Dataloader context map - void InitializeXdlContexts() { - if (std::getenv("XPU_PADDLE_XDL_CONTEXTS") == nullptr) { - return; - } - auto thread_map = phi::GetAllThreadNames(); - for (const auto& tp : thread_map) { - std::string t_name = tp.second; - if (t_name.substr(0, 10) == "Dataloader") { - SetXdlCtx(); - } - } - } - void SetXdlCtx() { - auto pid = phi::GetProcessId(); - if (xdl_context_map_.find(pid) == xdl_context_map_.end()) { + std::string tname = phi::GetCurrentThreadName(); + if (tname.substr(0, 10) == "Dataloader" && + context_map_.find(tname) == context_map_.end()) { + VLOG(4) << "Set XPU Dataloader Context with current thread name = " + << tname << " currently " << context_map_.size() + << " contexts existing"; xpu::Context* ctx_t = xpu::create_context(); - xdl_context_map_[pid] = ctx_t; + context_map_[tname] = ctx_t; } } xpu::Context* GetXdlCtx() const { - auto pid = phi::GetProcessId(); - return (xdl_context_map_.find(pid) == xdl_context_map_.end()) - ? nullptr - : xdl_context_map_.find(pid)->second; - } - - std::vector GetAllXdlCtxs() { - std::vector ctxs; - for (const auto& it : xdl_context_map_) { - ctxs.emplace_back(it.second); + std::string tname = phi::GetCurrentThreadName(); + VLOG(4) << "Get XPU Context with current thread name = " << tname + << " currently " << context_map_.size() << " contexts existing"; + if (tname.substr(0, 10) != "Dataloader") { + return context_; + } else { + return (context_map_.find(tname) == context_map_.end()) + ? nullptr + : context_map_.find(tname)->second; } - return ctxs; } bool owned_{false}; @@ -236,7 +197,7 @@ struct XPUContext::Impl { int runtime_version_; int driver_version_; xpu::Context* context_{nullptr}; - std::unordered_map xdl_context_map_; + std::unordered_map context_map_; // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
@@ -290,8 +251,6 @@ void XPUContext::SetXContext(xpu::Context* context) { void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); } -bool XPUContext::IsDataloader() const { return impl_->IsDataloader(); } - void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index fa4899d9871..3e734a064b9 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -47,10 +47,6 @@ class XPUContext : public DeviceContext, xpu::Context* x_context() const; - // For multi-thread dataloader, - // check if the current thread is Dataloader thread - bool IsDataloader() const; - // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; void SetBkclContext(xpu::BKCLContext_t context); diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h index 5b6c06ac36a..c787874b22f 100644 --- a/paddle/phi/backends/xpu/xpu_header.h +++ b/paddle/phi/backends/xpu/xpu_header.h @@ -51,4 +51,22 @@ class XPUTypeTrait { using Type = bfloat16; }; +template +class XPUTypeToPhiType { + public: + using Type = T; +}; + +template <> +class XPUTypeToPhiType { + public: + using Type = phi::dtype::float16; +}; + +template <> +class XPUTypeToPhiType { + public: + using Type = phi::dtype::bfloat16; +}; + #endif diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index 9d5f073eaa8..bbd13193cd4 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -45,6 +45,11 @@ int GetXPUCurrentDeviceId(); std::vector GetXPUSelectedDevices(); /***** Memory Management *****/ +//! Get the minimum chunk size for XPU buddy allocator. +inline size_t XPUMinChunkSize() { + // Allow to allocate the minimum chunk size is 64 bytes. + return 1 << 6; +} //! Copy memory from address src to dst synchronously. void MemcpySyncH2D(void *dst, diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 4f91ef3fc3b..35aeaaed078 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -542,7 +542,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. 
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** * Memory related FLAG diff --git a/paddle/phi/kernels/xpu/top_k_kernel.cc b/paddle/phi/kernels/xpu/top_k_kernel.cc index ccafeb4aebd..5d77e9c4dc8 100644 --- a/paddle/phi/kernels/xpu/top_k_kernel.cc +++ b/paddle/phi/kernels/xpu/top_k_kernel.cc @@ -17,6 +17,7 @@ #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/xpu/xpu_mem_util.h" namespace phi { template @@ -59,8 +60,9 @@ void TopkKernel(const Context& dev_ctx, size_t k = k_scalar.to(); if (axis + 1 == in_dims.size()) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int32_t* indices_int_data = - RAII_GUARD.alloc_l3_or_gm(indices->numel()); + int32_t* indices_int_data = Alloc_l3_or_gm( + dev_ctx, &RAII_GUARD, indices->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data); const size_t row = phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); @@ -104,7 +106,9 @@ void TopkKernel(const Context& dev_ctx, } xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - XPUType* trans_in_data = RAII_GUARD.alloc_l3_or_gm(x.numel()); + XPUType* trans_in_data = + Alloc_l3_or_gm(dev_ctx, &RAII_GUARD, x.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_in_data); // Transpose and save interval output to trans_in int r = xpu::transpose(dev_ctx.x_context(), @@ -119,10 +123,17 @@ void TopkKernel(const Context& dev_ctx, r, XPUAPIErrorMsg[r])); - XPUType* trans_out_data = RAII_GUARD.alloc_l3_or_gm(out->numel()); - int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm(out->numel()); + XPUType* trans_out_data = + Alloc_l3_or_gm(dev_ctx, &RAII_GUARD, out->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_out_data); + + int64_t* trans_idx_data = + Alloc_l3_or_gm(dev_ctx, &RAII_GUARD, out->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_idx_data); + int32_t* trans_idx_int32_data = - RAII_GUARD.alloc_l3_or_gm(out->numel()); + Alloc_l3_or_gm(dev_ctx, &RAII_GUARD, out->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_idx_int32_data); const size_t row = phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const size_t col = trans_dims[trans_dims.size() - 1]; diff --git a/paddle/phi/kernels/xpu/xpu_mem_util.h b/paddle/phi/kernels/xpu/xpu_mem_util.h new file mode 100644 index 00000000000..0b46f05b08d --- /dev/null +++ b/paddle/phi/kernels/xpu/xpu_mem_util.h @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include 
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_header.h"
+#include "paddle/phi/backends/xpu/xpu_info.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+template <typename T>
+T* Alloc_l3(xpu::ctx_guard* RAII_GUARD, const int64_t n) {
+  T* ret = RAII_GUARD->alloc_l3<T>(n);
+  return ret;
+}
+
+template <typename T, typename Context>
+T* Alloc_gm(const Context& dev_ctx, const int64_t n) {
+  DenseTensor ret_tensor;
+  DDim d({n});
+  ret_tensor.Resize(d);
+  T* ret = dev_ctx.template Alloc<T>(&ret_tensor);
+  return ret;
+}
+
+template <typename XPUT, typename Context>
+XPUT* Alloc_l3_or_gm(const Context& dev_ctx,
+                     xpu::ctx_guard* RAII_GUARD,
+                     const int64_t n) {
+  XPUT* ret = Alloc_l3<XPUT>(RAII_GUARD, n);
+  if (ret != nullptr) {
+    return ret;
+  }
+  using T = typename XPUTypeToPhiType<XPUT>::Type;
+  ret = reinterpret_cast<XPUT*>(Alloc_gm<T, Context>(dev_ctx, n));
+  return ret;
+}
+
+}  // namespace phi
+#endif
-- 
GitLab
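
Taken together with the top_k_kernel.cc hunk above, the kernel-side pattern this patch introduces is: create an xpu::ctx_guard, request scratch memory through Alloc_l3_or_gm (on-chip L3 first, global memory via the auto-growth allocator as fallback), and validate the returned pointer with PADDLE_ENFORCE_XDNN_NOT_NULL. A hedged sketch of that pattern inside a phi XPU kernel; SomeXPUKernel, its argument list, and the trailing xdnn call are illustrative placeholders, while XPUTypeTrait, Alloc_l3_or_gm, and PADDLE_ENFORCE_XDNN_NOT_NULL come from the files changed above:

    #include "paddle/phi/backends/xpu/enforce_xpu.h"
    #include "paddle/phi/backends/xpu/xpu_header.h"
    #include "paddle/phi/kernels/xpu/xpu_mem_util.h"

    namespace phi {

    // Illustrative placeholder kernel, not part of the patch.
    template <typename T, typename Context>
    void SomeXPUKernel(const Context& dev_ctx, int64_t n) {
      using XPUType = typename XPUTypeTrait<T>::Type;

      // RAII guard that releases any L3 scratch buffers when the kernel returns.
      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());

      // Try on-chip L3 memory first; fall back to global memory obtained through
      // dev_ctx.Alloc (and hence the new auto-growth XPU allocator) if L3 is full.
      XPUType* scratch = Alloc_l3_or_gm<XPUType, Context>(dev_ctx, &RAII_GUARD, n);
      PADDLE_ENFORCE_XDNN_NOT_NULL(scratch);

      // ... call xdnn ops that consume `scratch` ...
    }

    }  // namespace phi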