From 8126a41d732d268486f1d36e687b374e865ac0a6 Mon Sep 17 00:00:00 2001
From: lilong12
Date: Wed, 20 Jan 2021 13:32:15 +0800
Subject: [PATCH] fix the bug of all_reduce pipeline gradient multiple times
 (#30437)

* update, test=develop
---
 .../fleet/meta_optimizers/pipeline_optimizer.py    | 4 ++++
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 9 ++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 67a3312552..9e46bf3368 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -233,6 +233,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         block = self.main_program_list[ring_id - 1]['program'].global_block()
         origin_block = self.main_program.global_block()
         grad = None
+        processed_param_name = set()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_backward_op(op) and \
                     OP_ROLE_VAR_KEY in op.attr_names:
@@ -242,7 +243,10 @@ class PipelineOptimizer(MetaOptimizerBase):
                 assert len(op_role_var) % 2 == 0
                 offset = idx
                 for i in range(0, len(op_role_var), 2):
+                    param_name = op_role_var[i]
                     param = block.vars[op_role_var[i]]
+                    if param_name in processed_param_name: continue
+                    processed_param_name.add(param_name)
                     grad = block.vars[op_role_var[i + 1]]
                     origin_param = origin_block.vars[op_role_var[i]]
                     if origin_param.is_distributed:
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0d4d3f1ade..be60e489ca 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -10,7 +10,7 @@ if(NOT WITH_NCCL)
 endif()
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
-#list(APPEND DIST_TEST_OPS test_pipeline)
+list(APPEND DIST_TEST_OPS test_pipeline)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
@@ -62,7 +62,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
-list(REMOVE_ITEM TEST_OPS test_pipeline)
 
 if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
@@ -826,9 +825,9 @@ if(WITH_GPU AND NOT WIN32)
     set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120)
     set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120)
     set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120)
-#    if(WITH_DISTRIBUTE)
-#        set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
-#    endif()
+    if(WITH_DISTRIBUTE)
+        set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
+    endif()
     set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120)
     set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120)
     set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120)
-- 
GitLab
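
For readers following the first hunk: a minimal sketch of the deduplication pattern the fix introduces. A parameter can be listed in the OP_ROLE_VAR attribute of more than one backward op (e.g. when its gradient is accumulated), so without tracking processed names the insertion loop would schedule an allreduce for the same gradient once per occurrence, summing it across pipeline replicas multiple times. The helper and variable names below are hypothetical stand-ins, not Paddle's API.

# Hypothetical stand-in for the insertion loop in the first hunk above.
# Each entry in backward_ops mimics one backward op's OP_ROLE_VAR attribute:
# a flat list of alternating [param, grad, param, grad, ...] names.
def collect_allreduce_targets(backward_ops):
    processed_param_name = set()
    targets = []
    for op_role_var in backward_ops:
        assert len(op_role_var) % 2 == 0
        for i in range(0, len(op_role_var), 2):
            param_name = op_role_var[i]
            if param_name in processed_param_name:
                continue  # gradient already scheduled: skip the duplicate allreduce
            processed_param_name.add(param_name)
            targets.append((param_name, op_role_var[i + 1]))
    return targets

# "fc_0.w_0" appears under two backward ops; with the set it is allreduced once.
backward_ops = [
    ["fc_0.w_0", "fc_0.w_0@GRAD"],
    ["fc_0.w_0", "fc_0.w_0@GRAD", "fc_0.b_0", "fc_0.b_0@GRAD"],
]
assert collect_allreduce_targets(backward_ops) == [
    ("fc_0.w_0", "fc_0.w_0@GRAD"),
    ("fc_0.b_0", "fc_0.b_0@GRAD"),
]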