diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index 224ab21b4788f99b91e343f06afa55dcb2a69a82..23c463eca41428fc60f8e6ba2e94267bc6697b3e 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -465,8 +465,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
         new details::SparseAllReduceOpHandle(
             result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
             scopes, places, multi_nccl_ctxs_, is_encoded,
-            static_cast<int>(strategy_.trainers_endpoints_.size()) *
-                places_.size()));
+            strategy_.num_trainers_ * places_.size()));
   } else {
     result->Get<GraphOps>(kGraphOps).emplace_back(
         new details::AllReduceOpHandle(
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index fa5dd3673dc749e32b04c4c2b8f076ae856d0002..26b8e2c3b12a0a2e888377ddad24bbeae7c6f3e1 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -271,7 +271,6 @@ class CollectiveOptimizer(DistributedOptimizer):
         node_num = self._node_num()
         assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num
 
-        self._strategy.fuse_all_reduce_ops = True
         exec_strategy = self._strategy.exec_strategy
 
         if node_num <= 1:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 9708e53ba1f947371f0be7b6197547cc9f287d19..4080eebbb51555f96d0d1e444aa26f575733980e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -291,6 +291,10 @@ class TestDistRunnerBase(object):
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
+        if args.use_dgc:
+            # fuse_all_reduce_ops requires that gradients not be sparse types
+            build_stra.fuse_all_reduce_ops = False
+
         print_to_err(type(self).__name__, "begin to compile with data parallel")
         binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
@@ -852,7 +856,9 @@ class TestDistBase(unittest.TestCase):
 
         if check_error_log:
             required_envs["GLOG_vmodule"] = \
-                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10"
+                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10," \
+                "alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10," \
+                "sparse_all_reduce_op_handle=10"
            required_envs["GLOG_logtostderr"] = "1"
 
         local_losses \
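
Usage note: the test change above turns off build_strategy.fuse_all_reduce_ops whenever DGC is enabled, because DGC produces encoded, sparse-style gradients that the fused all-reduce path cannot consume. A minimal user-side sketch of the same configuration follows, assuming the paddle.fluid 1.x API used in these files; the loss_name value is hypothetical.

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    # DGC gradients are encoded (sparse-like), so keep gradient all-reduce ops unfused.
    build_strategy.fuse_all_reduce_ops = False

    exec_strategy = fluid.ExecutionStrategy()

    compiled_prog = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name="loss",  # hypothetical loss variable name
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)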