From 46f3a39e91fd422f1b6d5cbaadad9a35456eb36a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 1 Apr 2018 18:05:59 -0700 Subject: [PATCH] polish and add comments. --- .../fluid/framework/details/nccl_all_reduce_op_handle.h | 2 ++ .../framework/details/threaded_ssa_graph_executor.cc | 5 ++++- python/paddle/fluid/parallel_executor.py | 8 ++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 3d61fa79f75..bb926256676 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. bool IsDelayedOp() override { return true; }; protected: diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 075eed4ecca..32fc9100ab1 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set pending_vars; BlockingQueue ready_vars; std::unordered_set ready_ops; - + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple + // streams from multiple GPUs, it's faster to buffer them and schedule + // together since we currently cannot overlap computation and memcpy streams. + // Should revisit it if overlapping is available. 
std::unordered_set delayed_ops; std::unordered_set after_delayed_ops; std::unordered_set delayed_vars; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33e8d3bf210..fec7d6899ce 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,7 +16,6 @@ import core import multiprocessing import framework import executor -import sys __all__ = ['ParallelExecutor'] @@ -36,7 +35,12 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = len(places) + if use_cuda: + # Experiments on se-resnext show that too many threads hurt + # performance. Worth tuning for other models in the future. + num_threads = len(places) + else: + num_threads = min(len(places) * 2, multiprocessing.cpu_count()) startup = framework.default_startup_program() main = framework.default_main_program() -- GitLab