From 46f3a39e91fd422f1b6d5cbaadad9a35456eb36a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 1 Apr 2018 18:05:59 -0700 Subject: [PATCH] polish and add comments. --- .../fluid/framework/details/nccl_all_reduce_op_handle.h | 2 ++ .../framework/details/threaded_ssa_graph_executor.cc | 5 ++++- python/paddle/fluid/parallel_executor.py | 8 ++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 3d61fa79f75..bb926256676 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. bool IsDelayedOp() override { return true; }; protected: diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 075eed4ecca..32fc9100ab1 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set pending_vars; BlockingQueue ready_vars; std::unordered_set ready_ops; - + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple + // streams from multiple GPUs, it's faster to buffer them and schedule + // together since we currently cannot overlap computation and memcpy streams. + // Should revisit it if overlapping is available. 
std::unordered_set delayed_ops; std::unordered_set after_delayed_ops; std::unordered_set delayed_vars; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33e8d3bf210..fec7d6899ce 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,7 +16,6 @@ import core import multiprocessing import framework import executor -import sys __all__ = ['ParallelExecutor'] @@ -36,7 +35,12 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = len(places) + if use_cuda: + # Experiments on se-resnext show that too many threads hurt + # performance. Worth tuning for other models in the future. + num_threads = len(places) + else: + num_threads = min(len(places) * 2, multiprocessing.cpu_count()) startup = framework.default_startup_program() main = framework.default_main_program() -- GitLab