Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
fbe8982f
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
fbe8982f
编写于
6月 15, 2023
作者:
G
Ghost Screaming
提交者:
GitHub
6月 15, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Cherry-Pick] Fix problem of TimeOut of distributed testcases under cuda12. (#54635)
上级
bb4f7777
变更
9
显示空白变更内容
内联
并排
Showing 9 changed files with 34 additions and 22 deletions
+34
-22
test/auto_parallel/CMakeLists.txt
test/auto_parallel/CMakeLists.txt
+5
-5
test/collective/CMakeLists.txt
test/collective/CMakeLists.txt
+7
-7
test/collective/fleet/CMakeLists.txt
test/collective/fleet/CMakeLists.txt
+4
-4
test/collective/fleet/dygraph_group_sharded_stage3.py
test/collective/fleet/dygraph_group_sharded_stage3.py
+4
-1
test/collective/fleet/dygraph_group_sharded_stage3_offload.py
.../collective/fleet/dygraph_group_sharded_stage3_offload.py
+4
-1
test/collective/fleet/hybrid_parallel_mp_bf16.py
test/collective/fleet/hybrid_parallel_mp_bf16.py
+4
-1
test/collective/fleet/hybrid_parallel_pp_bf16.py
test/collective/fleet/hybrid_parallel_pp_bf16.py
+4
-1
test/distributed_passes/CMakeLists.txt
test/distributed_passes/CMakeLists.txt
+1
-1
test/legacy_test/CMakeLists.txt
test/legacy_test/CMakeLists.txt
+1
-1
未找到文件。
test/auto_parallel/CMakeLists.txt
浏览文件 @
fbe8982f
...
...
@@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules
(
test_optimization_tuner_api MODULES
test_optimization_tuner_api
)
set_tests_properties
(
test_optimization_tuner_api
PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT
8
0
)
PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT
12
0
)
py_test_modules
(
test_converter MODULES test_converter
)
set_tests_properties
(
test_converter PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT 50
)
...
...
@@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT 50
)
py_test_modules
(
test_pass_sharding MODULES test_pass_sharding
)
set_tests_properties
(
test_pass_sharding
PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT
5
0
)
PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT
10
0
)
py_test_modules
(
test_pass_amp MODULES test_pass_amp
)
set_tests_properties
(
test_pass_amp PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT
5
0
)
TIMEOUT
8
0
)
py_test_modules
(
test_amp_o2_pass MODULES test_amp_o2_pass
)
set_tests_properties
(
test_amp_o2_pass PROPERTIES LABELS
"RUN_TYPE=EXCLUSIVE"
TIMEOUT 50
)
...
...
@@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules
(
test_tuning_recompute MODULES test_tuning_recompute
)
set_tests_properties
(
test_tuning_recompute PROPERTIES TIMEOUT 300
)
py_test_modules
(
test_fused_linear_pass MODULES test_fused_linear_pass
)
set_tests_properties
(
test_fused_linear_pass PROPERTIES TIMEOUT
2
0
)
set_tests_properties
(
test_fused_linear_pass PROPERTIES TIMEOUT
4
0
)
py_test_modules
(
test_align_tool MODULES test_align_tool
)
set_tests_properties
(
test_align_tool PROPERTIES TIMEOUT 20
)
py_test_modules
(
test_pass_base_list MODULES test_pass_base_list
)
set_tests_properties
(
test_pass_base_list PROPERTIES TIMEOUT
2
0
)
set_tests_properties
(
test_pass_base_list PROPERTIES TIMEOUT
4
0
)
py_test_modules
(
test_fuse_adamw_pass MODULES test_fuse_adamw_pass
)
set_tests_properties
(
test_fuse_adamw_pass PROPERTIES TIMEOUT 20
)
py_test_modules
(
test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2
)
...
...
test/collective/CMakeLists.txt
浏览文件 @
fbe8982f
...
...
@@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_alltoall_single_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:
${
PADDLE_BINARY_DIR
}
/python"
)
set_tests_properties
(
test_collective_alltoall_single_api
PROPERTIES TIMEOUT
"1
2
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"1
6
0"
LABELS
"RUN_TYPE=DIST"
)
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND
(
LINUX
))
py_test_modules
(
...
...
@@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
"http_proxy=;https_proxy=;PYTHONPATH=..:
${
PADDLE_BINARY_DIR
}
/python"
)
if
(
${
CUDA_ARCH_NAME
}
STREQUAL
"Ampere"
)
set_tests_properties
(
test_collective_broadcast_api
PROPERTIES TIMEOUT
"
36
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
50
0"
LABELS
"RUN_TYPE=DIST"
)
else
()
set_tests_properties
(
test_collective_broadcast_api
PROPERTIES TIMEOUT
"
30
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
45
0"
LABELS
"RUN_TYPE=DIST"
)
endif
()
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND
(
LINUX
))
...
...
@@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api
ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:
${
PADDLE_BINARY_DIR
}
/python"
)
set_tests_properties
(
test_collective_isend_irecv_api
PROPERTIES TIMEOUT
"1
2
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"1
6
0"
LABELS
"RUN_TYPE=DIST"
)
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND
(
LINUX
))
py_test_modules
(
...
...
@@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
"http_proxy=;https_proxy=;PYTHONPATH=..:
${
PADDLE_BINARY_DIR
}
/python"
)
if
(
${
CUDA_ARCH_NAME
}
STREQUAL
"Ampere"
)
set_tests_properties
(
test_collective_reduce_scatter_api
PROPERTIES TIMEOUT
"
21
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
36
0"
LABELS
"RUN_TYPE=DIST"
)
else
()
set_tests_properties
(
test_collective_reduce_scatter_api
PROPERTIES TIMEOUT
"
1
50"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
2
50"
LABELS
"RUN_TYPE=DIST"
)
endif
()
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND
(
LINUX
))
...
...
@@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
test_collective_scatter_api MODULES test_collective_scatter_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:
${
PADDLE_BINARY_DIR
}
/python"
)
set_tests_properties
(
test_collective_scatter_api
PROPERTIES TIMEOUT
"
18
0"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
30
0"
LABELS
"RUN_TYPE=DIST"
)
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND
(
LINUX
))
py_test_modules
(
...
...
test/collective/fleet/CMakeLists.txt
浏览文件 @
fbe8982f
...
...
@@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
START_BASH
../../legacy_test/dist_test.sh
TIMEOUT
"1
2
0"
"1
6
0"
LABELS
"RUN_TYPE=DIST"
ENVS
...
...
@@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
START_BASH
../../legacy_test/dist_test.sh
TIMEOUT
"
12
0"
"
24
0"
LABELS
"RUN_TYPE=DIST"
ENVS
"PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:
${
PADDLE_BINARY_DIR
}
/python"
)
set_tests_properties
(
test_ir_pass_pipeline PROPERTIES TIMEOUT
"
12
0"
)
set_tests_properties
(
test_ir_pass_pipeline PROPERTIES TIMEOUT
"
24
0"
)
endif
()
if
((
WITH_GPU OR WITH_ROCM
)
AND LOCAL_ALL_PLAT
)
bash_test_modules
(
...
...
@@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX))
test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS
"http_proxy=;https_proxy=;PYTHONPATH=../..:
${
PADDLE_BINARY_DIR
}
/python"
)
set_tests_properties
(
test_dygraph_dist_save_load
PROPERTIES TIMEOUT
"
2
00"
LABELS
"RUN_TYPE=DIST"
)
PROPERTIES TIMEOUT
"
3
00"
LABELS
"RUN_TYPE=DIST"
)
endif
()
if
((
WITH_GPU
)
AND
(
LINUX
))
py_test_modules
(
...
...
test/collective/fleet/dygraph_group_sharded_stage3.py
浏览文件 @
fbe8982f
...
...
@@ -366,7 +366,10 @@ def test_stage2_stage3():
# bfp16
nccl_version
=
core
.
nccl_version
()
if
nccl_version
>=
21000
:
if
(
nccl_version
>=
21000
and
paddle
.
device
.
cuda
.
get_device_properties
().
major
>=
8
):
stage2_params
=
train_mlp
(
mlp11
,
sharding_stage
=
2
,
...
...
test/collective/fleet/dygraph_group_sharded_stage3_offload.py
浏览文件 @
fbe8982f
...
...
@@ -215,7 +215,10 @@ def test_stage3_offload():
# bfp16 offload
nccl_version
=
core
.
nccl_version
()
if
nccl_version
>=
21000
:
if
(
nccl_version
>=
21000
and
paddle
.
device
.
cuda
.
get_device_properties
().
major
>=
8
):
stage3_params
=
train_mlp
(
mlp7
,
use_pure_fp16
=
True
,
use_bfp16
=
True
)
stage3_params_offload
=
train_mlp
(
mlp8
,
use_pure_fp16
=
True
,
offload
=
True
,
use_bfp16
=
True
...
...
test/collective/fleet/hybrid_parallel_mp_bf16.py
浏览文件 @
fbe8982f
...
...
@@ -60,5 +60,8 @@ class TestMPFP16(TestDistMPTraning):
if
__name__
==
"__main__"
:
if
check_nccl_version_for_bf16
():
if
(
check_nccl_version_for_bf16
()
and
paddle
.
device
.
cuda
.
get_device_properties
().
major
>=
8
):
unittest
.
main
()
test/collective/fleet/hybrid_parallel_pp_bf16.py
浏览文件 @
fbe8982f
...
...
@@ -165,5 +165,8 @@ class TestDistPPTraning(unittest.TestCase):
if
__name__
==
"__main__"
:
if
check_nccl_version_for_bf16
():
if
(
check_nccl_version_for_bf16
()
and
paddle
.
device
.
cuda
.
get_device_properties
().
major
>=
8
):
unittest
.
main
()
test/distributed_passes/CMakeLists.txt
浏览文件 @
fbe8982f
...
...
@@ -29,6 +29,6 @@ endif()
foreach
(
TEST_OP
${
TEST_OPS
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
ENVS
"NVIDIA_TF32_OVERRIDE=0"
)
list
(
APPEND DIST_TEST_OPS
${
TEST_OP
}
)
set_tests_properties
(
${
TEST_OP
}
PROPERTIES TIMEOUT 2
0
0
)
set_tests_properties
(
${
TEST_OP
}
PROPERTIES TIMEOUT 2
5
0
)
set_tests_properties
(
${
TEST_OP
}
PROPERTIES LABELS
"RUN_TYPE=DIST"
)
endforeach
()
test/legacy_test/CMakeLists.txt
浏览文件 @
fbe8982f
...
...
@@ -1304,4 +1304,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500)
set_tests_properties
(
test_sync_batch_norm_op_static_build
PROPERTIES LABELS
"RUN_TYPE=DIST"
)
set_tests_properties
(
test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT
12
0
)
25
0
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录