Unverified commit a90d9088, authored by Ghost Screaming, committed by GitHub

Fix CUDA 12 timeout problems. (#54615)

* Fix a bug in the reduce_sum op: when input.numel() > INT32_MAX, its result
is wrong (a short sketch of the overflow follows this commit message).

* Remove climits.

* Fix pickle and NCCL_P2P_DISABLE problems in distributed test cases under
CUDA 12.

* Fix timeouts of distributed test cases under CUDA 12.

* Remove useless modifications.

* Remove useless modifications.
Parent 58b4c60f
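As context for the reduce_sum fix above, here is a minimal sketch of the failure mode, under the assumption that element offsets were tracked in a 32-bit integer inside the reduce kernel (the element count and the commented repro are illustrative; the actual fix lives in Paddle's CUDA reduce kernels):

```python
import ctypes

INT32_MAX = 2**31 - 1       # 2_147_483_647
numel = 2_400_000_000       # hypothetical element count above INT32_MAX

# What a 32-bit index sees for this element count: the value wraps negative,
# so offset arithmetic inside a reduce kernel walks the wrong memory.
print(ctypes.c_int32(numel).value)  # -> -1894967296

# Hedged repro sketch (needs a GPU with enough memory; shape is illustrative):
#   import paddle
#   x = paddle.ones([numel], dtype='bool')
#   print(paddle.sum(x.astype('int64')))  # expected value: 2400000000
```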
......
@@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_optimization_tuner_api MODULES
                   test_optimization_tuner_api)
   set_tests_properties(test_optimization_tuner_api
-                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
   py_test_modules(test_converter MODULES test_converter)
   set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
                                                  TIMEOUT 50)
......
@@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
   py_test_modules(test_pass_sharding MODULES test_pass_sharding)
   set_tests_properties(test_pass_sharding
-                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
   py_test_modules(test_pass_amp MODULES test_pass_amp)
   set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
-                                                TIMEOUT 50)
+                                                TIMEOUT 80)
   py_test_modules(test_amp_o2_pass MODULES test_amp_o2_pass)
   set_tests_properties(test_amp_o2_pass PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
                                                    TIMEOUT 50)
......
@@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_tuning_recompute MODULES test_tuning_recompute)
   set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300)
   py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass)
-  set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 20)
+  set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 40)
   py_test_modules(test_align_tool MODULES test_align_tool)
   set_tests_properties(test_align_tool PROPERTIES TIMEOUT 20)
   py_test_modules(test_pass_base_list MODULES test_pass_base_list)
-  set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 20)
+  set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 40)
   py_test_modules(test_fuse_adamw_pass MODULES test_fuse_adamw_pass)
   set_tests_properties(test_fuse_adamw_pass PROPERTIES TIMEOUT 20)
   py_test_modules(test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2)
......
@@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
     test_collective_alltoall_single_api ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
   set_tests_properties(test_collective_alltoall_single_api
-                       PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
+                       PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(
......
@@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
     "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
   if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
     set_tests_properties(test_collective_broadcast_api
-                         PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST")
+                         PROPERTIES TIMEOUT "500" LABELS "RUN_TYPE=DIST")
   else()
     set_tests_properties(test_collective_broadcast_api
-                         PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
+                         PROPERTIES TIMEOUT "450" LABELS "RUN_TYPE=DIST")
   endif()
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
......
@@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
     test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api
     ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
   set_tests_properties(test_collective_isend_irecv_api
-                       PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
+                       PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(
......
@@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
     "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
   if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
     set_tests_properties(test_collective_reduce_scatter_api
-                         PROPERTIES TIMEOUT "210" LABELS "RUN_TYPE=DIST")
+                         PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST")
   else()
     set_tests_properties(test_collective_reduce_scatter_api
-                         PROPERTIES TIMEOUT "150" LABELS "RUN_TYPE=DIST")
+                         PROPERTIES TIMEOUT "250" LABELS "RUN_TYPE=DIST")
   endif()
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
......
@@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
     test_collective_scatter_api MODULES test_collective_scatter_api ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
   set_tests_properties(test_collective_scatter_api
-                       PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST")
+                       PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(
......
@@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
     START_BASH
     ../../legacy_test/dist_test.sh
     TIMEOUT
-    "120"
+    "160"
     LABELS
     "RUN_TYPE=DIST"
     ENVS
......
@@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
     START_BASH
     ../../legacy_test/dist_test.sh
     TIMEOUT
-    "120"
+    "240"
     LABELS
     "RUN_TYPE=DIST"
     ENVS
     "PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
   )
-  set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "120")
+  set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "240")
 endif()
 if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
   bash_test_modules(
......
@@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX))
     test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
   set_tests_properties(test_dygraph_dist_save_load
-                       PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=DIST")
+                       PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
 endif()
 if((WITH_GPU) AND (LINUX))
   py_test_modules(
......
@@ -366,7 +366,10 @@ def test_stage2_stage3():
     # bfp16
     nccl_version = core.nccl_version()
-    if nccl_version >= 21000:
+    if (
+        nccl_version >= 21000
+        and paddle.device.cuda.get_device_properties().major >= 8
+    ):
         stage2_params = train_mlp(
             mlp11,
             sharding_stage=2,
......
@@ -215,7 +215,10 @@ def test_stage3_offload():
     # bfp16 offload
     nccl_version = core.nccl_version()
-    if nccl_version >= 21000:
+    if (
+        nccl_version >= 21000
+        and paddle.device.cuda.get_device_properties().major >= 8
+    ):
         stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True)
         stage3_params_offload = train_mlp(
             mlp8, use_pure_fp16=True, offload=True, use_bfp16=True
......
@@ -60,5 +60,8 @@ class TestMPFP16(TestDistMPTraning):
 if __name__ == "__main__":
-    if check_nccl_version_for_bf16():
+    if (
+        check_nccl_version_for_bf16()
+        and paddle.device.cuda.get_device_properties().major >= 8
+    ):
         unittest.main()
......
@@ -165,5 +165,8 @@ class TestDistPPTraning(unittest.TestCase):
 if __name__ == "__main__":
-    if check_nccl_version_for_bf16():
+    if (
+        check_nccl_version_for_bf16()
+        and paddle.device.cuda.get_device_properties().major >= 8
+    ):
         unittest.main()
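The four bf16 test gates above all take the same shape; here is a hedged sketch of the combined condition as a standalone helper (the helper name is illustrative, not Paddle API, and the `core` import path varies across Paddle releases, e.g. `paddle.fluid.core` vs `paddle.base.core`):

```python
# Sketch of the gate these hunks add: bf16 collective tests need both
# NCCL >= 2.10 (core.nccl_version() >= 21000) and a GPU with native bf16
# support, i.e. Ampere or newer (compute capability major >= 8).
import paddle
from paddle.base import core  # assumption: use paddle.fluid.core on older releases


def can_run_bf16_dist_test() -> bool:
    return (
        core.nccl_version() >= 21000
        and paddle.device.cuda.get_device_properties().major >= 8
    )
```

The NCCL version check alone was not enough: a pre-Ampere GPU (e.g. V100, compute capability 7.0) can ship with a new NCCL yet has no native bf16 support, which is presumably why these tests now also inspect the device's major compute capability.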
......
@@ -29,6 +29,6 @@ endif()
 foreach(TEST_OP ${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0")
   list(APPEND DIST_TEST_OPS ${TEST_OP})
-  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)
+  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 250)
   set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
 endforeach()
......
@@ -1312,4 +1312,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500)
 set_tests_properties(test_sync_batch_norm_op_static_build
                      PROPERTIES LABELS "RUN_TYPE=DIST")
 set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT
-                                                          160)
+                                                          250)