diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7a1b40c0b60a788b1f0a70e688f8fcbe427ad076..e3f8bbb72f2a1b75b6041d41496cef0efc81874f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_); + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9180903b864d03e59f55f41410b2240fa4199496..e3e7c55d153aec8ce9c25c962821b266eaa84fe4 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include +#include + namespace paddle { namespace framework { namespace details { @@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() { for (size_t i = 0; i < scopes.size(); ++i) { auto &scope = scopes[i]; - auto &t = scope->FindVar(var_name)->Get(); + auto &t = scope->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(var_name) + ->Get(); if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4bb83625060db337446d03a1afda3ed0..fbdb54ba8d940c8dedd44a42a85825af5d2ec664 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -24,6 +24,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; + class OpHandleBase { private: DISABLE_COPY_AND_ASSIGN(OpHandleBase); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 3b818b1a45b56351e34f9e52ec22b6d02a0c1591..a8833b7388ab907020a260d356f1484ffd227658 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -15,13 +15,15 @@ #pragma once #include +#include +#include + #include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" namespace paddle { namespace framework { namespace details { - class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 62af4c1d79ded5eaa30e4e6d43cc0d7327ae9689..1ce69ab02b09fe7ec17f479bcef97c931e853dc4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ready_ops.clear(); }; - // Create local scopes. - for (auto &scope : local_scopes_) { - auto &local_scope = scope->NewScope(); - *scope->Var("@TMP_SCOPE@")->GetMutable() = &local_scope; - } - // Step 3. Execution while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { // 1. Run All Ready ops @@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(delayed_ops.empty()); PADDLE_ENFORCE(blocked_by_delayed_ops.empty()); - ++computation_count_; - - auto sync_computation = [&] { - computation_count_ = 0; - // Wait All computational streams - for (auto p : this->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : local_scopes_) { - scope->DropKids(); - } - }; // Wait FetchOps. if (!fetch_ops.empty()) { fetch_ops.clear(); - sync_computation(); - } - - if (computation_count_ == max_async_computation) { - sync_computation(); - } - - // NOTE: the temp scope can be dropped lazily if needed. - // Drop tmp scopes; - for (auto &scope : local_scopes_) { - auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - kid = nullptr; } return fetch_data; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 79cfc26b461a39811a9a125e5aeac3492d967386..bb5e837b135c35b5aea403496b45aab1ccc288ff 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unique_ptr exception_; std::atomic running_ops_; bool allow_op_delay_; - - size_t computation_count_{0}; - size_t max_async_computation{100}; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20dcc080b696431b9972c0a972904d957f9b47d8..c1486b527d2e06d2b3f7e0f89458bf9a22564586 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #ifdef PADDLE_WITH_CUDA @@ -41,6 +42,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif + + std::vector> var_types_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor( allow_op_delay)); // Step 3. Create vars in each scope; - for (auto *scope : member_->local_scopes_) { - for (auto *var : main_program.Block(0).AllVars()) { - if (scope->FindVar(var->Name()) != nullptr) { - continue; - } - - InitializeVariable(scope->Var(var->Name()), var->GetType()); - } + for (auto *var : main_program.Block(0).AllVars()) { + member_->var_types_.emplace_back(var->Name(), var->GetType(), + var->Persistable()); } } @@ -163,9 +161,42 @@ void ParallelExecutor::Run( const std::unordered_map &feed_tensors) { platform::RecordBlock b(0); SplitTensorToPlaces(feed_tensors); + + // Create local scopes. + for (auto &scope : member_->local_scopes_) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &name_type_pair : member_->var_types_) { + if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { + continue; + } + + if (std::get<2>(name_type_pair)) { // Persistable + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } else { + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } + } + } + auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; + + // Wait All computational streams + for (auto p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : member_->local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + local_scope = nullptr; + } } void ParallelExecutor::SplitTensorToPlaces(