From 26c3077a3e0905e4b1cf3c9b06b4667dcc9cfdd4 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Mon, 8 May 2023 15:14:30 +0800 Subject: [PATCH] Fix timeout v2 (#53514) --- .../tests/unittests/collective/CMakeLists.txt | 16 ++++ .../unittests/collective/fleet/CMakeLists.txt | 76 +++++++++++++++++++ tools/gen_ut_cmakelists.py | 41 +++++++--- 3 files changed, 124 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 1aec76f7052..ae17fbb1938 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -44,6 +44,8 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX)) test_collective_split_embedding START_BASH ../dist_test.sh + TIMEOUT + "300" LABELS "RUN_TYPE=DIST" ENVS @@ -85,6 +87,8 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single START_BASH ../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -112,6 +116,8 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_batch_isend_irecv START_BASH ../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -176,6 +182,8 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_process_group START_BASH ../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -202,6 +210,8 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter START_BASH ../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -391,6 +401,8 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX)) test_world_size_and_rank START_BASH test_world_size_and_rank.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -414,6 +426,8 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX)) test_strategy_group START_BASH test_strategy_group.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -425,6 +439,8 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX)) test_orthogonal_strategy START_BASH test_orthogonal_strategy.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt index dad5f1d4b5b..17408305e59 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt @@ -37,6 +37,8 @@ if(WITH_NCCL) test_parallel_margin_cross_entropy START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -95,6 +97,8 @@ if(WITH_NCCL) test_parallel_dygraph_mp_layers START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -120,6 +124,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_dygraph_sharding_stage3_for_eager START_BASH ../../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -146,6 +152,8 @@ if((WITH_GPU) AND LOCAL_ALL_PLAT) test_parallel_dygraph_pipeline_parallel START_BASH ../../dist_test.sh + TIMEOUT + "500" LABELS "RUN_TYPE=DIST" ENVS @@ -159,6 +167,8 @@ if((WITH_GPU) AND LOCAL_ALL_PLAT) test_parallel_dygraph_pipeline_parallel_with_virtual_stage START_BASH ../../dist_test.sh + TIMEOUT + "500" LABELS "RUN_TYPE=DIST" ENVS @@ -173,6 +183,8 @@ if((WITH_GPU) AND LOCAL_ALL_PLAT) test_parallel_dygraph_pp_adaptor START_BASH ../../dist_test.sh + TIMEOUT + "500" LABELS "RUN_TYPE=DIST" ENVS @@ -193,6 +205,8 @@ if(WITH_NCCL) test_parallel_class_center_sample START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -207,6 +221,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_pipeline START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -226,6 +242,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_static_model_parallel START_BASH ../../dist_test.sh + TIMEOUT + "240" LABELS "RUN_TYPE=DIST" ENVS @@ -239,6 +257,8 @@ if(WITH_NCCL) test_parallel_dygraph_no_sync START_BASH ../../dist_test.sh + TIMEOUT + "300" LABELS "RUN_TYPE=DIST" ENVS @@ -252,6 +272,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_dygraph_sharding_stage2 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -264,6 +286,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_control_flow START_BASH ../../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -288,6 +312,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_hybrid_parallel_inference_helper START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -365,6 +391,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_sharding_parallel START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -378,6 +406,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_tensor_parallel START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -391,6 +421,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_dygraph_group_sharded_api_for_eager START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -417,6 +449,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_unused_variables START_BASH ../../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -442,6 +476,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_no_sync_gradient_check START_BASH ../../dist_test.sh + TIMEOUT + "60" LABELS "RUN_TYPE=DIST" ENVS @@ -472,6 +508,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_parallel_dygraph_qat START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -486,6 +524,8 @@ if(WITH_NCCL) test_parallel_dygraph_sparse_embedding START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -501,6 +541,8 @@ if((WITH_ROCM) AND LOCAL_ALL_PLAT) test_parallel_dygraph_sparse_embedding START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -521,6 +563,8 @@ if(WITH_NCCL) test_parallel_dygraph_sparse_embedding_over_height START_BASH ../../dist_test.sh + TIMEOUT + "150" LABELS "RUN_TYPE=DIST" ENVS @@ -536,6 +580,8 @@ if((WITH_ROCM) AND LOCAL_ALL_PLAT) test_parallel_dygraph_sparse_embedding_over_height START_BASH ../../dist_test.sh + TIMEOUT + "350" LABELS "RUN_TYPE=DIST" ENVS @@ -554,6 +600,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_auto_parallel_parallelizer START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -587,6 +635,8 @@ if((WITH_GPU OR WITH_XPU) AND (LINUX)) test_c_comm_init_op START_BASH test_c_comm_init_op.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -598,6 +648,8 @@ if((WITH_GPU) AND (LINUX)) test_fused_attention_pass_with_mp START_BASH test_fused_attention_pass_with_mp.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -610,6 +662,8 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) test_ir_pass_pipeline START_BASH ../../dist_test.sh + TIMEOUT + "120" LABELS "RUN_TYPE=DIST" ENVS @@ -622,6 +676,8 @@ if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT) test_parallel_dygraph_mnist START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -634,6 +690,8 @@ if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT) test_parallel_dygraph_se_resnext START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=DIST" ENVS @@ -704,6 +762,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -716,6 +776,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint1 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -728,6 +790,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint2 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -740,6 +804,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint3 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -752,6 +818,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint_multiple START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -764,6 +832,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_auto_checkpoint_dist_basic START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -776,6 +846,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_hdfs1 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -788,6 +860,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_hdfs2 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS @@ -800,6 +874,8 @@ if(LOCAL_ALL_ARCH AND (LINUX)) test_hdfs3 START_BASH ../../dist_test.sh + TIMEOUT + "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY" ENVS diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index 26cabcbaced..7ceb99db637 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -97,6 +97,8 @@ def _proccess_archs(arch): arch = arch.upper().strip() if len(arch) > 0: for a in arch.split(";"): + if '' == a: + continue assert a in [ "GPU", "ROCM", @@ -372,11 +374,12 @@ class DistUTPortManager: class CMakeGenerator: - def __init__(self, current_dirs, ignore_dirs): + def __init__(self, current_dirs, only_check, ignore_dirs): self.processed_dirs = set() self.port_manager = DistUTPortManager(ignore_dirs) self.current_dirs = _norm_dirs(current_dirs) self.modified_or_created_files = [] + self._only_check = only_check def prepare_dist_ut_port(self): for c in self._find_root_dirs(): @@ -457,6 +460,10 @@ class CMakeGenerator: for c in conditions: cmd += f"if ({c})\n" + time_out_str = ( + f' TIMEOUT "{timeout}"' if len(timeout.strip()) > 0 else '' + ) + if launcher[-3:] == ".sh": run_type = _process_run_type(run_type) dist_ut_port = self.port_manager.process_dist_port_num(num_port) @@ -466,6 +473,7 @@ class CMakeGenerator: {name} START_BASH {launcher} + {time_out_str} LABELS "RUN_TYPE={run_type}" ENVS @@ -492,9 +500,6 @@ class CMakeGenerator: run_type_str = ( "" if len(run_type) == 0 else f' LABELS "RUN_TYPE={run_type}"' ) - time_out_str = ( - f' TIMEOUT "{timeout}"' if len(timeout.strip()) > 0 else '' - ) run_serial_str = ( f' RUN_SERIAL {run_serial}' if len(run_serial) > 0 else '' ) @@ -562,8 +567,13 @@ class CMakeGenerator: # check whether the generated file are thge same with the existing file, ignoring the blank chars # if the are same, skip the weiting process - with open(f"{current_work_dir}/CMakeLists.txt", "r") as old_cmake_file: - char_seq = old_cmake_file.read().split() + if os.path.isfile(f"{current_work_dir}/CMakeLists.txt"): + with open( + f"{current_work_dir}/CMakeLists.txt", "r" + ) as old_cmake_file: + char_seq = old_cmake_file.read().split() + else: + char_seq = [] char_seq = "".join(char_seq) if char_seq != "".join(cmds.split()): @@ -574,8 +584,11 @@ class CMakeGenerator: self.modified_or_created_files.append( f"{current_work_dir}/CMakeLists.txt" ) - with open(f"{current_work_dir}/CMakeLists.txt", "w") as cmake_file: - print(cmds, end="", file=cmake_file) + if not self._only_check: + with open( + f"{current_work_dir}/CMakeLists.txt", "w" + ) as cmake_file: + print(cmds, end="", file=cmake_file) if __name__ == "__main__": @@ -607,6 +620,14 @@ if __name__ == "__main__": nargs='*', help="To keep dist ports the same with old version cmake, old cmakelists.txt files are needed to parse dist_ports. If a directories are newly created and there is no cmakelists.txt file, the directory path must be specified by this option. The dirs are not recursive.", ) + parser.add_argument( + "--only-check-changed", + '-o', + type=lambda x: x.lower() not in ["false", "0", "off"], + required=False, + default=False, + help="Only check wheather the CMake files should be rewriten, do not write it enven if it should be write", + ) args = parser.parse_args() assert not ( @@ -624,7 +645,9 @@ if __name__ == "__main__": if len(args.dirpaths) >= 1: current_work_dirs = current_work_dirs + list(args.dirpaths) - cmake_generator = CMakeGenerator(current_work_dirs, args.ignore_cmake_dirs) + cmake_generator = CMakeGenerator( + current_work_dirs, args.only_check_changed, args.ignore_cmake_dirs + ) cmake_generator.prepare_dist_ut_port() created = cmake_generator.parse_csvs() -- GitLab