diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 8c6c9f35e84f4fd7d2b5486ac0eb60beceb512a2..17dd1399119d190bcbc31adb34ec61deb92a9994 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -135,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
+      VLOG(3) << "multi device parameter server mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        VLOG(3) << "multi devices collective mode with allreduce";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        VLOG(3) << "multi devices collective mode with reduce";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 7d1e63f3682bca8965f6c5e695132dff44fa3715..478d2ffbcf2988487893984284d4597f018f0ca0 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -937,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (need_broadcast_var_ ||
-      (UseGPU() &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
+  // Broadcast received parameters when training in parameter server mode.
+  if (need_broadcast_var_) {
+    // There are 4 cases:
+    // 1. GPU && Reduce: Reduce each gradient, then broadcast it to the other
+    // GPUs. Need to broadcast received parameters to other GPUs.
+    // 2. GPU && AllReduce: AllReduce all gradients to each GPU. Need to
+    // broadcast received parameters to other GPUs.
+    // 3. CPU && AllReduce: AllReduce all gradients to each thread. Need to
+    // broadcast received parameters to other scopes.
+    // 4. CPU && Reduce: because all parameters share the same memory, there
+    // is no need to broadcast received parameters.
+    if (!UseGPU() &&
+        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+      return;
+    }
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index fa79db19ee895c028f9cad0f4212aaff0e513784..483a7d4f46f16761b683e4cea90f833be1fe944f 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -19,6 +19,7 @@ import sys
 
 from .. import compat as cpt
 from . import core
+from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -110,6 +111,8 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        self._build_strategy.is_distribution = framework.is_pserver_mode(
+            self._program)
         return self
 
     def with_inference_optimize(self, config):
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 15367c724e5304fed78ef58f8a27932e1d6de318..f01b6ab09437a70835e84d66ab59709039b6f54f 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -87,6 +87,15 @@ def _current_expected_place():
     return _imperative_current_expected_place_
 
 
+def is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
 class NameScope(object):
     def __init__(self, name="", parent=None):
         self._children = dict()
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 8586670c2481a0f997e84f33860b6df28b3223ae..648bf69273f7ce31431bf8006c4540580cb94b61 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
 
 
-def _is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else framework.default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class ParallelExecutor(object):
     """
     ParallelExecutor is designed for data parallelism, which focuses on distributing
@@ -140,7 +131,7 @@
         # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
         # num_trainers is 1, so the current fields of build_strategy doesn't tell if
         # it's distributed model.
-        build_strategy.is_distribution = framework.is_pserver_mode(
+        build_strategy.is_distribution = framework.is_pserver_mode(
             main_program) or num_trainers > 1
 
         # step4: get main_program, scope, local_scopes
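
For reference, the relocated is_pserver_mode helper only scans the program's global block for distribute ops. A minimal usage sketch follows (the toy program is hypothetical; a real pserver-mode program would carry send/recv ops inserted by the distribute transpiler):

import paddle.fluid as fluid
from paddle.fluid import framework
from paddle.fluid.compiler import BuildStrategy

# A toy local program: no send/recv ops appear in the global block.
prog = fluid.Program()
with fluid.program_guard(prog):
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)

print(framework.is_pserver_mode(prog))  # False: purely local training

# CompiledProgram now sets this flag itself in with_data_parallel();
# setting it by hand is equivalent to what the change above does.
build_strategy = BuildStrategy()
build_strategy.is_distribution = framework.is_pserver_mode(prog)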