diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 37b07e5736312b3050debe745f2d3c108469c5d6..15c496130c2b6c7643ff96661be09e5ac4870344 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{1};
+  size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 22bf0d308b2debe3b3c064b68c92f75cb8ab9c12..00b8136dc2ea06cc06ae5b586f6291cb866d950c 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -66,17 +66,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   ++drop_scope_counter_;
 
+  bool stream_end = false;
   if (!fetch_tensors.empty()) {
-    // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
+    WaitComputationalStreams();
+    stream_end = true;
   }
 
   if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    if (!stream_end) {
+      WaitComputationalStreams();
     }
 
     for (auto &scope : local_scopes_) {
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50b51d2b630aba06a5907dd721754d1f..0f6340213daee98a75401f9db0e628f7b4fd79fc 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
+ private:
+  inline void WaitComputationalStreams() {
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+  }
+
  private:
   size_t drop_scope_counter_{0};
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a63c71aad25961877af5ca2c5c6085868cdd3477..d590c3a3c6b2a589fda4ce327e9c557a664e71cc 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -815,7 +815,7 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
                 many iterations to clean up the temp variables which
                 is generated during execution. It may make the execution faster,
-                because the temp variable's shape maybe the same between two iterations. Default 1.
+                because the temp variable's shape maybe the same between two iterations. Default 100.
 
                 NOTES:
                     1. If you fetch data when calling the 'run', the ParallelExecutor