diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 3d61fa79f759b9c57ea3e60cc6639143bfbb1eac..bb926256676761f107ab386ff5815fabbd088664 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + // Delaying and buffering nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. bool IsDelayedOp() override { return true; }; protected: diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 075eed4ecca0f38f8868cb6742e89020d05bae97..32fc9100ab13c3a442c5a24e9ad750328c50de13 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set pending_vars; BlockingQueue ready_vars; std::unordered_set ready_ops; - + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple + // streams from multiple GPUs, it's faster to buffer them and schedule + // together since we currently cannot overlap computation and memcpy streams. + // Should revisit it if overlapping is available. 
std::unordered_set delayed_ops; std::unordered_set after_delayed_ops; std::unordered_set delayed_vars; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33e8d3bf210c72f5cfcf7d2f884b1397c25d7eae..fec7d6899ce80d8ddefa811cbf51ef7dd91e5dd7 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,7 +16,6 @@ import core import multiprocessing import framework import executor -import sys __all__ = ['ParallelExecutor'] @@ -36,7 +35,12 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = len(places) + if use_cuda: + # Experiments on se-resnext show that too many threads hurt + # performance. Worth tuning for other models in the future. + num_threads = len(places) + else: + num_threads = min(len(places) * 2, multiprocessing.cpu_count()) startup = framework.default_startup_program() main = framework.default_main_program()