diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index b28c884429c17936a2733664baecf96bf5ff492a..1de6d26d05b9e410b6e81f07c2f21829d8b8c54c 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -183,7 +183,7 @@ message DistributedStrategy {
   optional bool use_hierarchical_allreduce = 15 [ default = false ];
   optional int32 hierarchical_allreduce_inter_nranks = 16 [ default = 1 ];
   optional bool sync_batch_norm = 17 [ default = false ];
-  optional bool fuse_all_reduce_ops = 18 [ default = false ];
+  optional bool fuse_all_reduce_ops = 18 [ default = true ];
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
   optional bool cudnn_exhaustive_search = 21 [ default = false ];
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index 1387827736560e0e2e3fb00041eb372d77530c09..b6c25e3ad67d3a7d8628cf32aa7cd0c5564915e6 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -366,6 +366,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
             "gradient_merge_acc_step": 1,
             "mp_degree": 1
         }
+
+        strategy.fuse_all_reduce_ops = False
         self.optimizer(avg_cost, strategy, train_prog, startup_prog)
        startup_prog_ops = startup_prog.global_block().ops
        main_prog_ops = train_prog.global_block().ops
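
Note on the change (not part of the patch): flipping the proto default of `fuse_all_reduce_ops` to `true` means every newly constructed `DistributedStrategy` now enables all-reduce fusion unless the user opts out; presumably this is why the sharding test above pins the flag to `False`, so the exact unfused op sequence it asserts stays stable. A minimal sketch of the new default and the opt-out, assuming the standard `paddle.distributed.fleet` entry point:

```python
# Illustrative sketch only (assumption: paddle.distributed.fleet is the
# intended user-facing API); this snippet is not part of the patch.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()

# With this change the proto default is true, so fusion is on by default
# (it was previously off by default).
assert strategy.fuse_all_reduce_ops

# Jobs or tests that need one allreduce op per gradient opt out explicitly,
# exactly as the updated unit test does.
strategy.fuse_all_reduce_ops = False
```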