未验证 提交 a90d9088 编写于 作者: G Ghost Screaming 提交者: GitHub

Fix cuda12 timeout problems. (#54615)

* Fix a bug in the reduce_sum op: when input.numel() > INT32_MAX, its result
is wrong.

* Remove climits.

* Fix problems with pickle and NCCL_P2P_DISABLE in distributed test cases
under CUDA 12.

* Fix timeouts in distributed test cases under CUDA 12.

* Remove useless modification.

* Remove useless modification.
上级 58b4c60f
...@@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_optimization_tuner_api MODULES py_test_modules(test_optimization_tuner_api MODULES
test_optimization_tuner_api) test_optimization_tuner_api)
set_tests_properties(test_optimization_tuner_api set_tests_properties(test_optimization_tuner_api
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
py_test_modules(test_converter MODULES test_converter) py_test_modules(test_converter MODULES test_converter)
set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
TIMEOUT 50) TIMEOUT 50)
...@@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
py_test_modules(test_pass_sharding MODULES test_pass_sharding) py_test_modules(test_pass_sharding MODULES test_pass_sharding)
set_tests_properties(test_pass_sharding set_tests_properties(test_pass_sharding
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
py_test_modules(test_pass_amp MODULES test_pass_amp) py_test_modules(test_pass_amp MODULES test_pass_amp)
set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
TIMEOUT 50) TIMEOUT 80)
py_test_modules(test_amp_o2_pass MODULES test_amp_o2_pass) py_test_modules(test_amp_o2_pass MODULES test_amp_o2_pass)
set_tests_properties(test_amp_o2_pass PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" set_tests_properties(test_amp_o2_pass PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
TIMEOUT 50) TIMEOUT 50)
...@@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_tuning_recompute MODULES test_tuning_recompute) py_test_modules(test_tuning_recompute MODULES test_tuning_recompute)
set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300) set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300)
py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass) py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass)
set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 20) set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 40)
py_test_modules(test_align_tool MODULES test_align_tool) py_test_modules(test_align_tool MODULES test_align_tool)
set_tests_properties(test_align_tool PROPERTIES TIMEOUT 20) set_tests_properties(test_align_tool PROPERTIES TIMEOUT 20)
py_test_modules(test_pass_base_list MODULES test_pass_base_list) py_test_modules(test_pass_base_list MODULES test_pass_base_list)
set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 20) set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 40)
py_test_modules(test_fuse_adamw_pass MODULES test_fuse_adamw_pass) py_test_modules(test_fuse_adamw_pass MODULES test_fuse_adamw_pass)
set_tests_properties(test_fuse_adamw_pass PROPERTIES TIMEOUT 20) set_tests_properties(test_fuse_adamw_pass PROPERTIES TIMEOUT 20)
py_test_modules(test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2) py_test_modules(test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2)
......
...@@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) ...@@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_alltoall_single_api ENVS test_collective_alltoall_single_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_alltoall_single_api set_tests_properties(test_collective_alltoall_single_api
PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
...@@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) ...@@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
if(${CUDA_ARCH_NAME} STREQUAL "Ampere") if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set_tests_properties(test_collective_broadcast_api set_tests_properties(test_collective_broadcast_api
PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "500" LABELS "RUN_TYPE=DIST")
else() else()
set_tests_properties(test_collective_broadcast_api set_tests_properties(test_collective_broadcast_api
PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "450" LABELS "RUN_TYPE=DIST")
endif() endif()
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
...@@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) ...@@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api
ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_isend_irecv_api set_tests_properties(test_collective_isend_irecv_api
PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
...@@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) ...@@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
if(${CUDA_ARCH_NAME} STREQUAL "Ampere") if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set_tests_properties(test_collective_reduce_scatter_api set_tests_properties(test_collective_reduce_scatter_api
PROPERTIES TIMEOUT "210" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST")
else() else()
set_tests_properties(test_collective_reduce_scatter_api set_tests_properties(test_collective_reduce_scatter_api
PROPERTIES TIMEOUT "150" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "250" LABELS "RUN_TYPE=DIST")
endif() endif()
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
...@@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) ...@@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_scatter_api MODULES test_collective_scatter_api ENVS test_collective_scatter_api MODULES test_collective_scatter_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_scatter_api set_tests_properties(test_collective_scatter_api
PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
......
...@@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) ...@@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
START_BASH START_BASH
../../legacy_test/dist_test.sh ../../legacy_test/dist_test.sh
TIMEOUT TIMEOUT
"120" "160"
LABELS LABELS
"RUN_TYPE=DIST" "RUN_TYPE=DIST"
ENVS ENVS
...@@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) ...@@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
START_BASH START_BASH
../../legacy_test/dist_test.sh ../../legacy_test/dist_test.sh
TIMEOUT TIMEOUT
"120" "240"
LABELS LABELS
"RUN_TYPE=DIST" "RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" "PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
) )
set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "120") set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "240")
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT) if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
bash_test_modules( bash_test_modules(
...@@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX)) ...@@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX))
test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS
"http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_dygraph_dist_save_load set_tests_properties(test_dygraph_dist_save_load
PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=DIST") PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
endif() endif()
if((WITH_GPU) AND (LINUX)) if((WITH_GPU) AND (LINUX))
py_test_modules( py_test_modules(
......
...@@ -366,7 +366,10 @@ def test_stage2_stage3(): ...@@ -366,7 +366,10 @@ def test_stage2_stage3():
# bfp16 # bfp16
nccl_version = core.nccl_version() nccl_version = core.nccl_version()
if nccl_version >= 21000: if (
nccl_version >= 21000
and paddle.device.cuda.get_device_properties().major >= 8
):
stage2_params = train_mlp( stage2_params = train_mlp(
mlp11, mlp11,
sharding_stage=2, sharding_stage=2,
......
...@@ -215,7 +215,10 @@ def test_stage3_offload(): ...@@ -215,7 +215,10 @@ def test_stage3_offload():
# bfp16 offload # bfp16 offload
nccl_version = core.nccl_version() nccl_version = core.nccl_version()
if nccl_version >= 21000: if (
nccl_version >= 21000
and paddle.device.cuda.get_device_properties().major >= 8
):
stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True) stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True)
stage3_params_offload = train_mlp( stage3_params_offload = train_mlp(
mlp8, use_pure_fp16=True, offload=True, use_bfp16=True mlp8, use_pure_fp16=True, offload=True, use_bfp16=True
......
...@@ -60,5 +60,8 @@ class TestMPFP16(TestDistMPTraning): ...@@ -60,5 +60,8 @@ class TestMPFP16(TestDistMPTraning):
if __name__ == "__main__": if __name__ == "__main__":
if check_nccl_version_for_bf16(): if (
check_nccl_version_for_bf16()
and paddle.device.cuda.get_device_properties().major >= 8
):
unittest.main() unittest.main()
...@@ -165,5 +165,8 @@ class TestDistPPTraning(unittest.TestCase): ...@@ -165,5 +165,8 @@ class TestDistPPTraning(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
if check_nccl_version_for_bf16(): if (
check_nccl_version_for_bf16()
and paddle.device.cuda.get_device_properties().major >= 8
):
unittest.main() unittest.main()
...@@ -29,6 +29,6 @@ endif() ...@@ -29,6 +29,6 @@ endif()
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0") py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0")
list(APPEND DIST_TEST_OPS ${TEST_OP}) list(APPEND DIST_TEST_OPS ${TEST_OP})
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 250)
set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
endforeach() endforeach()
...@@ -1312,4 +1312,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) ...@@ -1312,4 +1312,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500)
set_tests_properties(test_sync_batch_norm_op_static_build set_tests_properties(test_sync_batch_norm_op_static_build
PROPERTIES LABELS "RUN_TYPE=DIST") PROPERTIES LABELS "RUN_TYPE=DIST")
set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT
160) 250)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册