diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7a1b40c0b60a788b1f0a70e688f8fcbe427ad076..e3f8bbb72f2a1b75b6041d41496cef0efc81874f 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }
 
-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }
 
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 9180903b864d03e59f55f41410b2240fa4199496..e3e7c55d153aec8ce9c25c962821b266eaa84fe4 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
 
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index d7a541ac4bb83625060db337446d03a1afda3ed0..fbdb54ba8d940c8dedd44a42a85825af5d2ec664 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 3b818b1a45b56351e34f9e52ec22b6d02a0c1591..a8833b7388ab907020a260d356f1484ffd227658 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once
 
 #include <memory>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-
 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 62af4c1d79ded5eaa30e4e6d43cc0d7327ae9689..1ce69ab02b09fe7ec17f479bcef97c931e853dc4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };
 
-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };
 
   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
-  }
-
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
   }
 
   return fetch_data;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 79cfc26b461a39811a9a125e5aeac3492d967386..bb5e837b135c35b5aea403496b45aab1ccc288ff 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 20dcc080b696431b9972c0a972904d957f9b47d8..c1486b527d2e06d2b3f7e0f89458bf9a22564586 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 
 #include <string>
+#include <tuple>
 #include <vector>
 
 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
       allow_op_delay));
 
   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
 }
 
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }
 
 void ParallelExecutor::SplitTensorToPlaces(