diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 3d61fa79f759b9c57ea3e60cc6639143bfbb1eac..bb926256676761f107ab386ff5815fabbd088664 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + // Delaying and buffering nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. bool IsDelayedOp() override { return true; }; protected: diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 075eed4ecca0f38f8868cb6742e89020d05bae97..32fc9100ab13c3a442c5a24e9ad750328c50de13 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set pending_vars; BlockingQueue ready_vars; std::unordered_set ready_ops; - + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple + // streams from multiple GPUs, it's faster to buffer them and schedule + // together since we currently cannot overlap computation and memcpy streams. + // Should revisit it if overlapping is available. 
std::unordered_set delayed_ops; std::unordered_set after_delayed_ops; std::unordered_set delayed_vars; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33e8d3bf210c72f5cfcf7d2f884b1397c25d7eae..fec7d6899ce80d8ddefa811cbf51ef7dd91e5dd7 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,7 +16,6 @@ import core import multiprocessing import framework import executor -import sys __all__ = ['ParallelExecutor'] @@ -36,7 +35,12 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = len(places) + if use_cuda: + # Experiments on se-resnext show that too many threads hurt + # performance. Worth tuning for other models in the future. + num_threads = len(places) + else: + num_threads = min(len(places) * 2, multiprocessing.cpu_count()) startup = framework.default_startup_program() main = framework.default_main_program()