polish and add comments.

46f3a39e · Xin Pan · d0ac9253 · 46f3a39e · 46f3a39e · 46f3a39e
3 changed file
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
  std::string Name() const override;
+  // Delaying and buffering nccl_all_reduce ops together can significantly
+  // increase performance. Disable this feature by returning false.
  bool IsDelayedOp() override { return true; };
 protected:

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  std::unordered_set<VarHandleBase *> pending_vars;
  BlockingQueue<VarHandleBase *> ready_vars;
  std::unordered_set<OpHandleBase *> ready_ops;
+  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+  // streams from multiple GPUs, it's faster to buffer them and schedule
+  // them together, since we currently cannot overlap computation and
+  // memcpy streams. Revisit this if overlapping becomes available.
  std::unordered_set<OpHandleBase *> delayed_ops;
  std::unordered_set<OpHandleBase *> after_delayed_ops;
  std::unordered_set<VarHandleBase *> delayed_vars;

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -16,7 +16,6 @@ import core
 import multiprocessing
 import framework
 import executor
-import sys
 __all__ = ['ParallelExecutor']
@@ -36,7 +35,12 @@ class ParallelExecutor(object):
                places.append(p)
        if num_threads is None:
-            num_threads = len(places)
+            if use_cuda:
+                # Experiments on se-resnext show that too many threads hurt
+                # performance. Worth tuning for other models in the future.
+                num_threads = len(places)
+            else:
+                num_threads = min(len(places) * 2, multiprocessing.cpu_count())
        startup = framework.default_startup_program()
        main = framework.default_main_program()