diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312b3050debe745f2d3c108469c5d6..15c496130c2b6c7643ff96661be09e5ac4870344 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{1}; + size_t num_iteration_per_drop_scope_{100}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 22bf0d308b2debe3b3c064b68c92f75cb8ab9c12..00b8136dc2ea06cc06ae5b586f6291cb866d950c 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -66,17 +66,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); ++drop_scope_counter_; + bool stream_end = false; if (!fetch_tensors.empty()) { - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } + WaitComputationalStreams(); + stream_end = true; } if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (!stream_end) { + WaitComputationalStreams(); } for (auto &scope : local_scopes_) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50b51d2b630aba06a5907dd721754d1f..0f6340213daee98a75401f9db0e628f7b4fd79fc 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ 
b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override; + private: + inline void WaitComputationalStreams() { + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + } + private: size_t drop_scope_counter_{0}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a63c71aad25961877af5ca2c5c6085868cdd3477..d590c3a3c6b2a589fda4ce327e9c557a664e71cc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -815,7 +815,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 1. + because the temp variable's shape maybe the same between two iterations. Default 100. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor