diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6b7bbf9003a57731d4d78933fe3cba933e1405d5..5a4f218077de41775053f971cdb244adae7c28db 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() {
         PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
             buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
             comm, stream));
-        if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream);
+        // TODO(Yancey1989): synchronizing here can get better performance
+        // if we don't use the NCCL group call, but it needs more profiling.
+        if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
       });
     }
 
     this->RunAndRecordEvent([&] {
-      // TODO(Yancey1989): need allreduce operator to avoid this flag
-      if (nccl_ctxs_->need_group_call_) {
+      if (all_reduce_calls.size() == 1UL) {
+        all_reduce_calls[0]();
+      } else {
         platform::NCCLGroupGuard guard;
         for (auto &call : all_reduce_calls) {
           call();
         }
-      } else {
-        // only used in executor_type == ParallalGraph, one thread one GPU
-        // TODO(Yancey1989): use allreduce operator to avoid this tricky.
-        PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
-        all_reduce_calls[0]();
       }
     });
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 6e8cf86fcc99d411e67c54a50bb11466158b8612..5b82805ad9391d82fd9b6cf020658cb0a2801c1d 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       CreateComputationalOps(&result, node, places_.size());
     }
 
-// insert synchronous ops at the backpropagation; and
-// insert synchronous ops if the graph contains mutilple places.
+// insert collective ops during backpropagation; and
+// insert collective ops if the graph contains multiple places.
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     if (!is_forwarding &&
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4914e0a5ad3426350855feec03c16ff3708693c4..4822627ac3b65972f41d9a23d9fe3dba3de3f97d 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) {
 #else
   PADDLE_ENFORCE(!use_cuda);
 #endif
+
   RunImpl();
 }
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index cebf63364da6a5d4588d5613d61d64f68e9fa0fd..677a2937945b03fa577317cb4f26e09354d06957 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       if (LIKELY(!strategy_.dry_run_)) {
         op->Run(strategy_.use_cuda_);
       }
+      VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 2604e41045b74af323c7ba6e5788cd06ae0d8fbc..63f3ef0eaccd3201a7a74270173c9ad4695cd0be 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor(
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
   ncclUniqueId *nccl_id = nullptr;
-  bool need_group_call = true;
   if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
     // parallel graph mode should initialize nccl by ncclCommInitRank since
     // it call nccl operator per device per thread.
@@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor(
     } else {
       nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
     }
-    need_group_call = false;
   } else if (nccl_id_var != nullptr) {  // the other executor type.
     // the distributed training with nccl mode would initialize the nccl id in
     // startup_program.
     nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
   } else {
-    // initlize NCCL by ncclCommInitAll, do not need nccl_id.
+    // initialize NCCL by ncclCommInitAll, no need to initialize the nccl_id.
   }
 
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-      member_->places_, nccl_id, num_trainers, trainer_id, need_group_call));
+      member_->places_, nccl_id, num_trainers, trainer_id));
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif
@@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor(
   graphs.push_back(std::move(graph));
 #endif
 
+  auto max_memory_size = GetEagerDeletionThreshold();
+  // TODO(Yancey1989): fix gc failure on the ParallelGraph executor.
+  if (max_memory_size >= 0 &&
+      exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
+    graphs[0] = member_->PrepareGCAndRefCnts(
+        std::move(graphs[0]), static_cast<size_t>(max_memory_size));
+  }
+
   // Step 3. Create vars in each scope. Passes may also create new vars.
   // skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 8fd834be9acc80a5ff3989c158d1ae3d81873196..7a51d18fbbf65f68725aa86a6a0ce4d15dff5673 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -27,6 +27,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
 struct ExceptionHandler {
   mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
   explicit ExceptionHandler(
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 440b16cf915f977cbe833c0ed87de87acc62d52f..ed719f91d0980480aa62a5cd3c1f819e6c0e7475 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
       sin >> num;
       place = platform::CUDAPlace(static_cast<int>(num));
     }
+
     out->Reset(framework::MakeDecoratedReader<BufferedReader>(
         underlying_reader, place, 2));
   }
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 23a0222239adc537cbd27f53ca4eb4c66a913c3c..8d062dcdb47587e0e19b54ac96cbb8c6801a580d 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -82,15 +82,12 @@ struct NCCLContext {
 struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
-  bool need_group_call_;
 
   explicit NCCLContextMap(const std::vector<platform::Place> &places,
                           ncclUniqueId *nccl_id = nullptr,
-                          size_t num_trainers = 1, size_t trainer_id = 0,
-                          bool need_group_call = true) {
+                          size_t num_trainers = 1, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
-    need_group_call_ = need_group_call;
     for (auto &p : places) {
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
       order_.emplace_back(dev_id);
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 0ff079b4e2c7a204820f43850049f296508ff40c..fffe8bee5806019d2f08616544572556689b01e5 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_convergence(False)
 
     def test_simple_fc_with_new_strategy(self):
-        # use_cuda, use_reducea
+        # use_cuda, use_reduce
         self._compare_reduce_and_allreduce(simple_fc_net, True)
         self._compare_reduce_and_allreduce(simple_fc_net, False)