未验证 提交 9eb4d89b 编写于 作者: R Roc 提交者: GitHub

move collective tests into a collective directory (#45223)

* add simple reformated ci files

* update

* add readme for new unit tests

* add readme for new unit tests

* add readme for new unit tests

* reset mlu

* update for samples

* add base api

* reset some dist unit tests

* add warning in generated cmakelists file

* update readme for new dist unit tests

* add all collective tests

* remain base file and launcher file

* Update README.md

* Update README.md

* fix env PYTHONPATH

* Update gen_ut_cmakelists.py

* add all collective tests

* add docs for gen_ut_cmakelists.py

* prettify code

* commont name == "name"

* update for comments

* update function's help

* update for run type

* update readme

* add all collective tests

* add all collective tests

* mv  collective test files

* update for all collective tests

* update

* update

* update

* update for all tests

* update for checking name

* Update Cmakelists.txt

* update testlist.csv

* remain test_parallel_dygraph_dataparallel in unittests

* set broadcast op all platforms

* update

* remain test_broadcast_tensors_op

* fix

* rm some collective files

* update more collective tests

* update

* update

* update
gen_ut_supports recursion

* update

* update

* update

* update

* fix nccl version

* update

* update

* update

* update

* fix a bug and try to pass

* update

* add csv

* update for timeout

* remove tcp store

* fix

* fix

* update

* update

* update for more dist tests

* move multi node tests

* update

* update

* update

* fix for auto parallel

* update

* update path in python file

* update

* reset some test in unittests

* fix

* update readme

* fix

* update

* fix port
上级 2c89bccb
...@@ -364,6 +364,18 @@ if(WIN32) ...@@ -364,6 +364,18 @@ if(WIN32)
endif() endif()
endif() endif()
if(NOT WITH_TESTING AND WITH_MULTINODE_TESTING)
message(
WARNING
"Disable WITH_MULTINODE_TESTING when compiling without TESTING. Force WITH_MULTINODE_TESTING=OFF."
)
set(WITH_MULTINODE_TESTING
OFF
CACHE STRING
"Disable WITH_MULTINODE_TESTING when compiling without TESTING"
FORCE)
endif()
if(NOT WITH_GPU AND WITH_NCCL) if(NOT WITH_GPU AND WITH_NCCL)
message( message(
WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
......
...@@ -6,64 +6,271 @@ set(LOCAL_ALL_ARCH ON) ...@@ -6,64 +6,271 @@ set(LOCAL_ALL_ARCH ON)
set(LOCAL_ALL_PLAT ON) set(LOCAL_ALL_PLAT ON)
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
test_allreduce test_allreduce MODULES test_allreduce ENVS
MODULES "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
test_allreduce
ENVS
"PADDLE_DIST_UT_PORT=20071;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy="
)
set_tests_properties(test_allreduce PROPERTIES TIMEOUT "120" RUN_SERIAL 1) set_tests_properties(test_allreduce PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
test_broadcast test_broadcast MODULES test_broadcast ENVS
MODULES "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
test_broadcast set_tests_properties(test_broadcast PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_c_concat MODULES test_c_concat ENVS
"PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
set_tests_properties(test_c_concat PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_c_identity MODULES test_c_identity ENVS
"PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
set_tests_properties(test_c_identity PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_c_split MODULES test_c_split ENVS
"PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
set_tests_properties(test_c_split PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_ROCM OR WITH_GPU) AND (LINUX))
bash_test_modules(
test_collective_split_embedding
START_BASH
../dist_test.sh
LABELS
"RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=20073;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" "PADDLE_DIST_UT_PORT=21288;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy="
) )
set_tests_properties(test_broadcast PROPERTIES TIMEOUT "120" RUN_SERIAL 1) set_tests_properties(test_collective_split_embedding PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_allgather_api MODULES test_collective_allgather_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_allgather_object_api MODULES
test_collective_allgather_object_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_allgather_object_api
PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT "120"
RUN_SERIAL 1)
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
test_c_concat test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS
MODULES "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
test_c_concat set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT "120"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
bash_test_modules(
test_collective_alltoall_single
START_BASH
../dist_test.sh
LABELS
"RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=20075;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" "PADDLE_DIST_UT_PORT=21290;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
) )
set_tests_properties(test_c_concat PROPERTIES TIMEOUT "120" RUN_SERIAL 1) set_tests_properties(test_collective_alltoall_single PROPERTIES TIMEOUT "350"
RUN_SERIAL 1)
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
test_c_identity test_collective_barrier_api MODULES test_collective_barrier_api ENVS
MODULES "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
test_c_identity set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
bash_test_modules(
test_collective_batch_isend_irecv
START_BASH
../dist_test.sh
LABELS
"RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=20077;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" "PADDLE_DIST_UT_PORT=21292;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
) )
set_tests_properties(test_c_identity PROPERTIES TIMEOUT "120" RUN_SERIAL 1) set_tests_properties(test_collective_batch_isend_irecv
PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT "120"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_cpu_barrier_with_gloo MODULES
test_collective_cpu_barrier_with_gloo ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_cpu_barrier_with_gloo
PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_global_gather MODULES test_collective_global_gather ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT "200"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_global_scatter MODULES test_collective_global_scatter ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT "200"
RUN_SERIAL 1)
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules( py_test_modules(
test_c_split test_collective_optimizer MODULES test_collective_optimizer ENVS
MODULES "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
test_c_split set_tests_properties(test_collective_optimizer PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
bash_test_modules(
test_collective_process_group
START_BASH
../dist_test.sh
LABELS
"RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=20079;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" "PADDLE_DIST_UT_PORT=21294;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
) )
set_tests_properties(test_c_split PROPERTIES TIMEOUT "120" RUN_SERIAL 1) set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT "350"
RUN_SERIAL 1)
endif() endif()
if((WITH_ROCM OR WITH_GPU) AND (LINUX)) if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_reduce MODULES test_collective_reduce ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_reduce_api MODULES test_collective_reduce_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
bash_test_modules( bash_test_modules(
test_collective_split_embedding test_collective_reduce_scatter
START_BASH START_BASH
../dist_test.sh ../dist_test.sh
LABELS LABELS
"RUN_TYPE=DIST" "RUN_TYPE=DIST"
ENVS ENVS
"PADDLE_DIST_UT_PORT=20081;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" "PADDLE_DIST_UT_PORT=21296;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
) )
set_tests_properties(test_collective_split_embedding PROPERTIES TIMEOUT "300" set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT "350"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_scatter MODULES test_collective_scatter ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_scatter_api MODULES test_collective_scatter_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT "300"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_sendrecv MODULES test_collective_sendrecv ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT "300"
RUN_SERIAL 1) RUN_SERIAL 1)
endif() endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT "120"
RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_split_col_linear MODULES test_collective_split_col_linear
ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_split_col_linear
PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_split_embedding_none_divisible MODULES
test_collective_split_embedding_none_divisible ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_split_embedding_none_divisible
PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_split_row_linear MODULES test_collective_split_row_linear
ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_split_row_linear
PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_collective_wait MODULES test_collective_wait ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_collective_wait PROPERTIES TIMEOUT "300" RUN_SERIAL
1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_eager_dist_api MODULES test_eager_dist_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" RUN_SERIAL
1)
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
py_test_modules(
test_new_group_api MODULES test_new_group_api ENVS
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
endif()
if((WITH_GPU
OR WITH_ROCM
OR WITH_ASCEND
OR WITH_ASCEND_CL
)
AND LOCAL_ALL_PLAT)
bash_test_modules(
test_gen_nccl_id_op
START_BASH
../dist_test.sh
LABELS
"RUN_TYPE=DIST"
ENVS
"PADDLE_DIST_UT_PORT=21298;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
)
set_tests_properties(test_gen_nccl_id_op PROPERTIES RUN_SERIAL 1)
endif()
add_subdirectory(fleet)
add_subdirectory(multinode)
...@@ -6,16 +6,15 @@ ...@@ -6,16 +6,15 @@
and specify the properties for the new unit test and specify the properties for the new unit test
the properties are the following: the properties are the following:
* `name`: the test's name * `name`: the test's name
* `os`: The supported operating system, ignoring case. If the test runs in multiple operating systems, use ";" to split systems, forexample, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. If the value is empty, this means the test runs on all operating systems. * `os`: The supported operating system, ignoring case. If the test runs in multiple operating systems, use ";" to split systems, for example, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. If the value is empty, this means the test runs on all operating systems.
* `arch`: the device's architecture. similar to `os`, multiple values are split by ";" and ignoring case. The supported architectures are `gpu`, `xpu`, `npu` and `rocm`. * `arch`: the device's architecture. similar to `os`, multiple values are split by ";" and ignoring case. The supported architectures are `gpu`, `xpu`, `ASCEND`, `ASCEND_CL` and `rocm`.
* `timeout`: timeout of a unittest, whose unit is second. * `timeout`: timeout of a unittest, whose unit is second.
* `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`,which are case-insensitive. * `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`,which are case-insensitive.
* `launcher`: the test launcher.Supported values are test_runner.py, dist_test.sh and custom scripts' name. * `launcher`: the test launcher.Supported values are test_runner.py, dist_test.sh and custom scripts' name.
* `dist_ut_port`: the starting port used in a distributed unit test * `num_port`: the number of ports used in a distributed unit test
* `run_serial`: whether in serial mode. the value can be 1 or 0.Default (empty) is 0. * `run_serial`: whether in serial mode. the value can be 1 or 0.Default (empty) is 0.
* `ENVS`: required environments. multiple environments are split by ";". * `ENVS`: required environments. multiple environments are split by ";".
* `conditions`: extra required conditions for some tests. the value is a boolean expression in the CMake language. * `conditions`: extra required conditions for some tests. The value is a list of boolean expressions in the CMake language, split with ";". For example, the value can be `WITH_DGC;NOT WITH_NCCL` or `WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212`. The relationship between these expressions is a conjunction (logical AND).
### step 3. Generate CmakeLists.txt ### step 3. Generate CmakeLists.txt
Run the cmd: Run the cmd:
......
...@@ -49,7 +49,7 @@ class TestDistMnistNCCL2DGC(TestDistBase): ...@@ -49,7 +49,7 @@ class TestDistMnistNCCL2DGC(TestDistBase):
def test_dist_train(self): def test_dist_train(self):
import paddle.fluid as fluid import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", self.check_with_place(os.path.abspath("../../dist_mnist.py"),
delta=1e-5, delta=1e-5,
check_error_log=True, check_error_log=True,
log_name=flag_name) log_name=flag_name)
...@@ -80,7 +80,8 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase): ...@@ -80,7 +80,8 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
def test_dist_train(self): def test_dist_train(self):
import paddle.fluid as fluid import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
self.check_with_place_multi_cards("dist_mnist.py", self.check_with_place_multi_cards(
os.path.abspath("../../dist_mnist.py"),
delta=1e-5, delta=1e-5,
check_error_log=True, check_error_log=True,
log_name=flag_name) log_name=flag_name)
......
...@@ -17,8 +17,6 @@ import unittest ...@@ -17,8 +17,6 @@ import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
import os import os
import os
flag_name = os.path.splitext(__file__)[0] flag_name = os.path.splitext(__file__)[0]
...@@ -35,7 +33,7 @@ class TestDistSeResnetNCCL2DGC(TestDistBase): ...@@ -35,7 +33,7 @@ class TestDistSeResnetNCCL2DGC(TestDistBase):
def test_dist_train(self): def test_dist_train(self):
import paddle.fluid as fluid import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_se_resnext.py", self.check_with_place(os.path.abspath("../../dist_se_resnext.py"),
delta=30, delta=30,
check_error_log=True, check_error_log=True,
log_name=flag_name) log_name=flag_name)
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册