From fbe8982fff08039b74c399a391cddf8f55ff6b9f Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 15 Jun 2023 10:27:18 +0800 Subject: [PATCH] [Cherry-Pick] Fix problem of TimeOut of distributed testcases under cuda12. (#54635) --- test/auto_parallel/CMakeLists.txt | 10 +++++----- test/collective/CMakeLists.txt | 14 +++++++------- test/collective/fleet/CMakeLists.txt | 8 ++++---- .../fleet/dygraph_group_sharded_stage3.py | 5 ++++- .../fleet/dygraph_group_sharded_stage3_offload.py | 5 ++++- test/collective/fleet/hybrid_parallel_mp_bf16.py | 5 ++++- test/collective/fleet/hybrid_parallel_pp_bf16.py | 5 ++++- test/distributed_passes/CMakeLists.txt | 2 +- test/legacy_test/CMakeLists.txt | 2 +- 9 files changed, 34 insertions(+), 22 deletions(-) diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index c805071af32..811ce947dbb 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_optimization_tuner_api MODULES test_optimization_tuner_api) set_tests_properties(test_optimization_tuner_api - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) py_test_modules(test_converter MODULES test_converter) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) @@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) py_test_modules(test_pass_sharding MODULES test_pass_sharding) set_tests_properties(test_pass_sharding - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) py_test_modules(test_pass_amp MODULES test_pass_amp) set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" - TIMEOUT 50) + TIMEOUT 80) py_test_modules(test_amp_o2_pass MODULES test_amp_o2_pass) set_tests_properties(test_amp_o2_pass PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) @@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_tuning_recompute MODULES test_tuning_recompute) set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300) py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass) - set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 20) + set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 40) py_test_modules(test_align_tool MODULES test_align_tool) set_tests_properties(test_align_tool PROPERTIES TIMEOUT 20) py_test_modules(test_pass_base_list MODULES test_pass_base_list) - set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 20) + set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 40) py_test_modules(test_fuse_adamw_pass MODULES test_fuse_adamw_pass) set_tests_properties(test_fuse_adamw_pass PROPERTIES TIMEOUT 20) py_test_modules(test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2) diff --git a/test/collective/CMakeLists.txt b/test/collective/CMakeLists.txt index a5e9e0e3ac7..1e0cf94dfa9 100644 --- a/test/collective/CMakeLists.txt +++ b/test/collective/CMakeLists.txt @@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") if(${CUDA_ARCH_NAME} STREQUAL "Ampere") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "500" LABELS "RUN_TYPE=DIST") else() set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "450" LABELS "RUN_TYPE=DIST") endif() endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) @@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") if(${CUDA_ARCH_NAME} STREQUAL "Ampere") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "210" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST") else() set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "150" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "250" LABELS "RUN_TYPE=DIST") endif() endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) @@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt index 47d6db03896..4a472f19508 100644 --- a/test/collective/fleet/CMakeLists.txt +++ b/test/collective/fleet/CMakeLists.txt @@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) START_BASH ../../legacy_test/dist_test.sh TIMEOUT - "120" + "160" LABELS "RUN_TYPE=DIST" ENVS @@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) START_BASH ../../legacy_test/dist_test.sh TIMEOUT - "120" + "240" LABELS "RUN_TYPE=DIST" ENVS "PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "120") + set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "240") endif() if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT) bash_test_modules( @@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX)) test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_dygraph_dist_save_load - PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU) AND (LINUX)) py_test_modules( diff --git a/test/collective/fleet/dygraph_group_sharded_stage3.py b/test/collective/fleet/dygraph_group_sharded_stage3.py index 5499968079a..bbe0884d982 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3.py @@ -366,7 +366,10 @@ def test_stage2_stage3(): # bfp16 nccl_version = core.nccl_version() - if nccl_version >= 21000: + if ( + nccl_version >= 21000 + and paddle.device.cuda.get_device_properties().major >= 8 + ): stage2_params = train_mlp( mlp11, sharding_stage=2, diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_offload.py b/test/collective/fleet/dygraph_group_sharded_stage3_offload.py index e97a163e42f..b34f178aa83 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3_offload.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3_offload.py @@ -215,7 +215,10 @@ def test_stage3_offload(): # bfp16 offload nccl_version = core.nccl_version() - if nccl_version >= 21000: + if ( + nccl_version >= 21000 + and paddle.device.cuda.get_device_properties().major >= 8 + ): stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True) stage3_params_offload = train_mlp( mlp8, use_pure_fp16=True, offload=True, use_bfp16=True diff --git a/test/collective/fleet/hybrid_parallel_mp_bf16.py b/test/collective/fleet/hybrid_parallel_mp_bf16.py index 9e0847b38c5..ae977f98917 100644 --- a/test/collective/fleet/hybrid_parallel_mp_bf16.py +++ b/test/collective/fleet/hybrid_parallel_mp_bf16.py @@ -60,5 +60,8 @@ class TestMPFP16(TestDistMPTraning): if __name__ == "__main__": - if check_nccl_version_for_bf16(): + if ( + check_nccl_version_for_bf16() + and paddle.device.cuda.get_device_properties().major >= 8 + ): unittest.main() diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py index 6ae0a98d5c8..70b3aec1515 100644 --- a/test/collective/fleet/hybrid_parallel_pp_bf16.py +++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py @@ -165,5 +165,8 @@ class TestDistPPTraning(unittest.TestCase): if __name__ == "__main__": - if check_nccl_version_for_bf16(): + if ( + check_nccl_version_for_bf16() + and paddle.device.cuda.get_device_properties().major >= 8 + ): unittest.main() diff --git a/test/distributed_passes/CMakeLists.txt b/test/distributed_passes/CMakeLists.txt index 79bc34620a4..12018ff20de 100644 --- a/test/distributed_passes/CMakeLists.txt +++ b/test/distributed_passes/CMakeLists.txt @@ -29,6 +29,6 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0") list(APPEND DIST_TEST_OPS ${TEST_OP}) - set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 250) set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") endforeach() diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index b3d630d2d52..bd9ad353f79 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1304,4 +1304,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT - 120) + 250) -- GitLab