From 0e26361ca11cd6f6d905518744af8638fe4c9871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Thu, 4 Aug 2022 10:20:57 +0800 Subject: [PATCH] add xpu garbage collector for standalone executor. (#44572) --- .../framework/new_executor/CMakeLists.txt | 7 +- .../garbage_collector/CMakeLists.txt | 21 +-- .../event_garbage_collector.cc | 103 +++++++------- .../event_garbage_collector.h | 23 ++-- .../fast_garbage_collector.cc | 9 +- .../fast_garbage_collector.h | 10 +- .../garbage_collector/garbage_collector.cc | 33 ++++- .../garbage_collector/garbage_collector.h | 15 ++- .../no_event_garbage_collector.cc | 106 +++++++++++++++ .../no_event_garbage_collector.h | 39 ++++++ .../framework/new_executor/interpretercore.cc | 55 +------- .../framework/new_executor/interpretercore.h | 1 - paddle/fluid/platform/CMakeLists.txt | 9 -- paddle/fluid/platform/device_context.h | 4 +- paddle/fluid/platform/device_event.h | 6 - paddle/fluid/platform/device_event_base.h | 4 +- paddle/fluid/platform/device_event_xpu.cc | 126 ------------------ 17 files changed, 276 insertions(+), 295 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc create mode 100644 paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h delete mode 100644 paddle/fluid/platform/device_event_xpu.cc diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index f72a815797..e01fcb68fb 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -39,15 +39,10 @@ set(STANDALONE_EXECUTOR_DEPS scope glog workqueue - interpretercore_event_garbage_collector + interpretercore_garbage_collector ${DEVICE_EVENT_LIBS} glog) -if(WITH_GPU OR WITH_ROCM) - set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS} - interpretercore_fast_garbage_collector) -endif() - cc_library( standalone_executor SRCS ${STANDALONE_EXECUTOR_SRCS} diff --git a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt index 359c56c561..d7ff6e4d50 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt @@ -1,22 +1,5 @@ cc_library( interpretercore_garbage_collector - SRCS garbage_collector.cc + SRCS garbage_collector.cc event_garbage_collector.cc fast_garbage_collector.cc + no_event_garbage_collector.cc DEPS garbage_collector) -cc_library( - interpretercore_event_garbage_collector - SRCS event_garbage_collector.cc - DEPS interpretercore_garbage_collector) - -if(WITH_GPU OR WITH_ROCM) - if(WITH_GPU) - nv_library( - interpretercore_fast_garbage_collector - SRCS fast_garbage_collector.cc - DEPS interpretercore_garbage_collector) - elseif(WITH_ROCM) - hip_library( - interpretercore_fast_garbage_collector - SRCS fast_garbage_collector.cc - DEPS interpretercore_garbage_collector) - endif() -endif() diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index 5f3386d52d..6133d6ece8 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -24,48 +24,33 @@ namespace paddle { namespace framework { -InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector() { +InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector( + const std::vector& vec_instruction) { WorkQueueOptions options(/*name*/ "GarbageCollector", /*num_threads*/ 1, /*allow_spinning*/ true, /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); + for (auto& instruc : vec_instruction) { + gc_event_.emplace_back(instruc.DeviceContext().GetPlace(), + platform::GenerateDeviceEventFlag()); + } } InterpreterCoreEventGarbageCollector::~InterpreterCoreEventGarbageCollector() { queue_.reset(nullptr); } -void InterpreterCoreEventGarbageCollector::Add( - Garbage garbage, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) { - if (!garbage) { - return; - } - - if (max_memory_size_ <= 1) { - Free(garbage, event, ctx); - } else { - std::unique_ptr pending_delete_garbages; - { // lock guard - std::lock_guard guard(spinlock_); - cur_memory_size_ += garbage->size(); - garbages_->push_back(std::move(garbage)); - - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - pending_delete_garbages = std::move(garbages_); - garbages_ = std::make_unique(); - } - } - } -} - -void InterpreterCoreEventGarbageCollector::Add(Variable* var) { - PADDLE_THROW(platform::errors::Unimplemented( - "Add(Variable* var) is not implemented for " - "InterpreterCoreEventGarbageCollector.")); +void InterpreterCoreEventGarbageCollector::Add(Variable* var, + const Instruction& instr) { + PADDLE_ENFORCE_LT(instr.Id(), + gc_event_.size(), + platform::errors::OutOfRange( + "The index should be less than the size of gc event " + ", but got index is %d and size is %d", + instr.Id(), + gc_event_.size())); + Add(var, &gc_event_.at(instr.Id()), &instr.DeviceContext()); } void InterpreterCoreEventGarbageCollector::Add( @@ -109,23 +94,28 @@ void InterpreterCoreEventGarbageCollector::Add( } } -void InterpreterCoreEventGarbageCollector::Free( - GarbageQueue* garbages, +void InterpreterCoreEventGarbageCollector::Add( + Garbage garbage, platform::DeviceEvent* event, const platform::DeviceContext* ctx) { - event->Record(ctx); - event->SetFininshed(); // Only for CPU Event - queue_->AddTask([container = garbages, event = event]() { - while (!event->Query()) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif - continue; + if (!garbage) { + return; + } + + if (max_memory_size_ <= 1) { + Free(garbage, event, ctx); + } else { + { // lock guard + std::lock_guard guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->push_back(std::move(garbage)); + events_[ctx] = event; + + if (cur_memory_size_ >= max_memory_size_) { + FreeGarbages(); + } } - delete container; - }); + } } void InterpreterCoreEventGarbageCollector::Free( @@ -146,5 +136,28 @@ void InterpreterCoreEventGarbageCollector::Free( }); } +void InterpreterCoreEventGarbageCollector::FreeGarbages() { + for (auto& vals : events_) { + vals.second->Record(vals.first); + vals.second->SetFininshed(); // Only for CPU Event + } + queue_->AddTask( + [container = std::move(*garbages_), events = std::move(events_)]() { + for (auto& vals : events) { + while (!vals.second->Query()) { +#if defined(_WIN32) + SleepEx(50, FALSE); +#else + sched_yield(); +#endif + continue; + } + } + }); + cur_memory_size_ = 0; + garbages_->clear(); + events_.clear(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h index 415aa5a96d..305dbb598b 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h @@ -24,28 +24,31 @@ namespace framework { class InterpreterCoreEventGarbageCollector : public InterpreterCoreGarbageCollector { public: - InterpreterCoreEventGarbageCollector(); + InterpreterCoreEventGarbageCollector( + const std::vector& vec_instruction); ~InterpreterCoreEventGarbageCollector(); - - void Add(Variable* var) override; - - virtual void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx); + void Add(Variable* var, const Instruction& instruction) override; private: + void Add(Variable* var, + platform::DeviceEvent* event, + const platform::DeviceContext* ctx); void Add(Garbage garbage, platform::DeviceEvent* event, const platform::DeviceContext* ctx); - void Free(GarbageQueue* garbages, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx); + void Free(const Garbage& garbage, platform::DeviceEvent* event, const platform::DeviceContext* ctx); + void FreeGarbages(); + std::unique_ptr queue_; paddle::memory::SpinLock spinlock_; + std::vector gc_event_; + std::unordered_map + events_; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc index 762e211bcb..f6ed094887 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc @@ -17,12 +17,9 @@ namespace paddle { namespace framework { -void InterpreterCoreFastGarbageCollector::Add( - Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) { - PADDLE_THROW(platform::errors::Unimplemented( - "Not implemented for InterpreterCoreFastGarbageCollector.")); +void InterpreterCoreFastGarbageCollector::Add(Variable* var, + const Instruction&) { + Add(var); } void InterpreterCoreFastGarbageCollector::Add(Variable* var) { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h index 6b5fd33f68..07034a4f29 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h @@ -13,8 +13,6 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" namespace paddle { @@ -23,15 +21,11 @@ namespace framework { class InterpreterCoreFastGarbageCollector : public InterpreterCoreGarbageCollector { public: - void Add(Variable* var) override; - void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) override; + void Add(Variable* var, const Instruction& instr) override; private: + void Add(Variable* var); void Add(Garbage garbage); }; } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index 8e849c79bd..e7e925a477 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -13,17 +13,48 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" - #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h" + +DECLARE_bool(fast_eager_deletion_mode); namespace paddle { namespace framework { +bool IsInterpretercoreFastGCEnabled() { + return memory::allocation::AllocatorFacade::Instance() + .IsStreamSafeCUDAAllocatorUsed() && + FLAGS_fast_eager_deletion_mode; +} + InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() { garbages_ = std::make_unique(); max_memory_size_ = static_cast(GetEagerDeletionThreshold()); cur_memory_size_ = 0; } +std::unique_ptr +CreateInterpreterCoreGarbageCollector( + const platform::Place& place, + const std::vector& vec_instruction) { + if (platform::is_gpu_place(place)) { + if (IsInterpretercoreFastGCEnabled()) { + return std::unique_ptr( + new InterpreterCoreFastGarbageCollector()); + } else { + return std::unique_ptr( + new InterpreterCoreEventGarbageCollector(vec_instruction)); + } + } else if (platform::is_xpu_place(place) || platform::is_ipu_place(place)) { + return std::unique_ptr( + new InterpreterCoreNoEventGarbageCollector()); + } else { + return std::unique_ptr( + new InterpreterCoreEventGarbageCollector(vec_instruction)); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h index c0397ceeb6..2e8e1792cd 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h @@ -15,6 +15,7 @@ #include +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/enforce.h" @@ -30,10 +31,9 @@ class InterpreterCoreGarbageCollector { public: InterpreterCoreGarbageCollector(); virtual ~InterpreterCoreGarbageCollector() {} - virtual void Add(Variable* var) = 0; - virtual void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) = 0; + + virtual void Add(Variable* var, const Instruction& instruction) = 0; + DISABLE_COPY_AND_ASSIGN(InterpreterCoreGarbageCollector); protected: @@ -43,5 +43,12 @@ class InterpreterCoreGarbageCollector { memory::SpinLock spinlock_; }; +bool IsInterpretercoreFastGCEnabled(); + +std::unique_ptr +CreateInterpreterCoreGarbageCollector( + const platform::Place& place, + const std::vector& vec_instruction); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc new file mode 100644 index 0000000000..bbe7659ab0 --- /dev/null +++ b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h" + +namespace paddle { +namespace framework { + +InterpreterCoreNoEventGarbageCollector:: + InterpreterCoreNoEventGarbageCollector() { + WorkQueueOptions options(/*name*/ "NoEventGarbageCollector", + /*num_threads*/ 1, + /*allow_spinning*/ true, + /*track_task*/ false); + queue_ = CreateSingleThreadedWorkQueue(options); +} + +InterpreterCoreNoEventGarbageCollector:: + ~InterpreterCoreNoEventGarbageCollector() { + queue_.reset(nullptr); +} + +void InterpreterCoreNoEventGarbageCollector::Add(Variable* var, + const Instruction& instr) { + Add(var, &instr.DeviceContext()); +} + +void InterpreterCoreNoEventGarbageCollector::Add( + Variable* var, const platform::DeviceContext* ctx) { + if (UNLIKELY(max_memory_size_ < 0) || var == nullptr) { + return; + } + + if (var->IsType()) { + Add(var->GetMutable()->MoveMemoryHolder(), ctx); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder(), + ctx); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + Add(t.MoveMemoryHolder(), ctx); + } + } else if (var->IsType>()) { + // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE + // refer to executor.cc to see what old garbage collector does. + // do nothing, because the sub scope will be deleted by sub-executor. + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } +} + +void InterpreterCoreNoEventGarbageCollector::Add( + Garbage garbage, const platform::DeviceContext* ctx) { + if (!garbage) { + return; + } + if (max_memory_size_ <= 1) { + queue_->AddTask([container = garbage, ctx = ctx]() { ctx->Wait(); }); + } else { + // lock guard + std::lock_guard guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->emplace_back(std::move(garbage)); + ctxs_.insert(ctx); + + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + queue_->AddTask( + [container = std::move(*garbages_), dev_ctxs = std::move(ctxs_)]() { + for (auto& ctx : dev_ctxs) { + ctx->Wait(); + } + }); + ctxs_.clear(); + garbages_->clear(); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h new file mode 100644 index 0000000000..36c8adec36 --- /dev/null +++ b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + +namespace paddle { +namespace framework { + +class InterpreterCoreNoEventGarbageCollector + : public InterpreterCoreGarbageCollector { + public: + InterpreterCoreNoEventGarbageCollector(); + ~InterpreterCoreNoEventGarbageCollector(); + void Add(Variable* var, const Instruction& instr) override; + + private: + void Add(Variable* var, const platform::DeviceContext* ctx); + void Add(Garbage garbage, const platform::DeviceContext* ctx); + std::unique_ptr queue_; + std::unordered_set ctxs_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 1a98b44729..ecbce3b610 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -18,8 +18,6 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" @@ -41,7 +39,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); -DECLARE_bool(fast_eager_deletion_mode); constexpr const char* kExceptionCaught = "ExceptionCaught"; constexpr const char* kTaskCompletion = "TaskCompletion"; @@ -52,12 +49,6 @@ namespace framework { static constexpr size_t kHostNumThreads = 4; static constexpr size_t kDeviceNumThreads = 1; -bool IsInterpretercoreFastGCEnabled() { - return memory::allocation::AllocatorFacade::Instance() - .IsStreamSafeCUDAAllocatorUsed() && - FLAGS_fast_eager_deletion_mode; -} - InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, const std::set& skip_gc_vars, @@ -71,16 +62,6 @@ InterpreterCore::InterpreterCore(const platform::Place& place, is_build_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsInterpretercoreFastGCEnabled()) { - gc_ = std::make_unique(); - } else { - gc_ = std::make_unique(); - } -#else - gc_ = std::make_unique(); -#endif - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); @@ -498,16 +479,7 @@ void InterpreterCore::Convert( } BuildSkipShareLoDInfo(); - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { -#ifdef PADDLE_WITH_IPU - gc_event_.emplace_back(phi::CPUPlace(), 0); -#else - gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), - platform::GenerateDeviceEventFlag()); - -#endif - } + gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); bool inplaced = false; for (auto inst : vec_instruction_) { if (inst.OpBase()->Type() == "share_buffer" || @@ -828,9 +800,6 @@ void InterpreterCore::RunInstructionAsync( RunInstruction(instr_node); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - RecordStreamForGC(instr_node); -#endif CheckGC(instr_node, atomic_var_ref); interpreter::RecordEvent(instr_node, place_); @@ -969,7 +938,9 @@ void InterpreterCore::CheckGC( std::vector>* atomic_var_ref) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); - size_t instr_id = instr.Id(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + RecordStreamForGC(instr); +#endif auto& var_scope = var_scope_; for (auto var_id : instr.GCCheckVars()) { @@ -986,23 +957,7 @@ void InterpreterCore::CheckGC( if (is_ready) { VLOG(6) << "Async delete variable with name : " << var_scope.GetNameById(var_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsInterpretercoreFastGCEnabled()) { - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id)); - - } else { - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id), - &gc_event_.at(instr_id), - &instr.DeviceContext()); - } -#else - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id), - &gc_event_.at(instr_id), - &instr.DeviceContext()); -#endif + gc_->Add(var_scope_.VarRef(var_id), instr); } } } diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 7069be5af1..a7efa1349e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -141,7 +141,6 @@ class InterpreterCore { std::shared_ptr completion_notifier_{nullptr}; std::unique_ptr gc_; - std::vector gc_event_; std::future> atomic_deps_; std::future> atomic_var_ref_; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 75331dbbb2..6ed27fd9b3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -265,15 +265,6 @@ cc_library( set(DEVICE_EVENT_LIBS device_event_base CACHE INTERNAL "device event libs") -if(WITH_XPU) - cc_library( - device_event_xpu - SRCS device_event_xpu.cc - DEPS device_event_base xpu_info) - set(DEVICE_EVENT_LIBS - device_event_xpu - CACHE INTERNAL "device event libs") -endif() if(WITH_ASCEND_CL) cc_library( diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 4bb1e3abf8..4b8833f9a6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -113,8 +113,8 @@ bool AllowTF32Cudnn(); enum DeviceType { CPU = 0, CUDA = 1, - XPU = 2, - NPU = 3, + NPU = 2, + XPU = 3, IPU = 4, MLU = 5, diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 2edccfa90c..cf80266050 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -37,12 +37,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) #endif -#ifdef PADDLE_WITH_XPU -USE_EVENT(kXPU); -USE_EVENT_WAIT(kXPU, kXPU) -USE_EVENT_WAIT(kCPU, kXPU) -#endif - #ifdef PADDLE_WITH_ASCEND_CL USE_EVENT(kNPU); USE_EVENT_WAIT(kNPU, kNPU) diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index a2d3fc1dc3..6a2948480b 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,9 +64,9 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); - // TODO(Aurelius84): only support CPU/CUDA/XPU/NPU. + // TODO(Aurelius84): only support CPU/CUDA/NPU. PADDLE_ENFORCE_LT(type_id_, - 4, + 3, platform::errors::Unavailable( "Currently DeviceEvent do not support %s", place)); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device_event_xpu.cc b/paddle/fluid/platform/device_event_xpu.cc deleted file mode 100644 index 098b0a5645..0000000000 --- a/paddle/fluid/platform/device_event_xpu.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device_event_base.h" - -#ifdef PADDLE_WITH_XPU -namespace paddle { -namespace platform { - -struct XPUDeviceEventWrapper { - explicit XPUDeviceEventWrapper(const platform::Place& place) { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(place), - true, - platform::errors::PreconditionNotMet( - "Required device shall be XPUPlace, but received %d. ", place)); - - device_id_ = place.device; - PADDLE_ENFORCE_GT( - device_id_, - -1, - platform::errors::PreconditionNotMet( - "Required DeviceOption.device_id > -1, but received %d. ", - device_id_)); - xpu_event_create(&handle_); - } - - xpuEventHandle handle_; - int device_id_; -}; - -void DeviceEventCreateXPU(DeviceEvent* event, - const platform::Place& place, - unsigned int) { - event->InitEvent(std::make_shared(place)); -} - -void DeviceEventRecordXPU(DeviceEvent* event, const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - - auto* xpu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - xpu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into XPUDeviceContext.")); - xpu_event_record(wrapper->handle_, xpu_dev_ctx->stream()); -} - -void DeviceEventFinishXPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - xpu_event_wait(wrapper->handle_); -} - -// current xpu not support query, used wait to instead. -bool DeviceEventQueryXPU(const DeviceEvent* event) { - DeviceEventFinishXPU(event); - return true; -} - -void DeviceEventXPUWaitXPU(const DeviceEvent* event, - const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - auto* xpu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - xpu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into XOUDeviceContext.")); - xpu_stream_wait_event(xpu_dev_ctx->stream(), wrapper->handle_); -} - -void DeviceEventCPUWaitXPU(const DeviceEvent* event, - const DeviceContext* context) { - DeviceEventFinishXPU(event); -} - -void DeviceEventSetFinishedXPU(const DeviceEvent* event) { - // do nothing -} - -void EventResetXPU(const DeviceEvent* event) { - // do nothing -} - -} // namespace platform -} // namespace paddle - -using ::paddle::platform::kCPU; -using ::paddle::platform::kXPU; -REGISTER_EVENT_CREATE_FUNCTION(kXPU, paddle::platform::DeviceEventCreateXPU) -REGISTER_EVENT_RECORD_FUNCTION(kXPU, paddle::platform::DeviceEventRecordXPU) -REGISTER_EVENT_QUERY_FUNCTION(kXPU, paddle::platform::DeviceEventQueryXPU) -REGISTER_EVENT_FINISH_FUNCTION(kXPU, paddle::platform::DeviceEventFinishXPU) -REGISTER_EVENT_SET_FINISHED_FUNCTION( - kXPU, paddle::platform::DeviceEventSetFinishedXPU) -REGISTER_EVENT_WAIT_FUNCTION(kXPU, - kXPU, - paddle::platform::DeviceEventXPUWaitXPU) -REGISTER_EVENT_WAIT_FUNCTION(kCPU, - kXPU, - paddle::platform::DeviceEventCPUWaitXPU) -REGISTER_EVENT_RESET_FUNCTION(kXPU, paddle::platform::EventResetXPU) -#endif -- GitLab