From 0fe16539ef3651966080d5ae96850da4557751e0 Mon Sep 17 00:00:00 2001 From: WangXi Date: Tue, 17 Dec 2019 10:32:50 +0800 Subject: [PATCH] Fix dgc & launch tests in cpu ci (#21759) --- .../fluid/tests/unittests/CMakeLists.txt | 13 ++++++++-- .../unittests/test_dist_mnist_dgc_nccl.py | 24 +++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 70fa1178f98..b61c7ea04b4 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -257,14 +257,23 @@ if(WITH_DISTRIBUTE) py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS}) py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS}) if(WITH_DGC) + # if with dgc, test all dgc tests. + # NOTE. dist dgc tests is already in DIST_TEST_OPS py_test_modules(test_dgc_op MODULES test_dgc_op) py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op) py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer) + else() + # if not with dgc, must close all dgc tests + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc") endif() if(NOT APPLE) bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh) - bash_test_modules(test_launch MODULES test_launch.sh) - bash_test_modules(test_launch_ps MODULES test_launch_ps.sh) + if(WITH_GPU) + # NOTE. 
test_launch only works in gpu collective mode + bash_test_modules(test_launch MODULES test_launch.sh) + endif() + bash_test_modules(test_launch_ps MODULES test_launch_ps.sh) set(dist_ut_port 1000) foreach(TEST_OP ${DIST_TEST_OPS}) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index 757f9ba5c1c..0b9b85d5d52 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -49,13 +49,15 @@ class TestDistMnistNCCL2DGC(TestDistBase): log_name=flag_name) def tearDown(self): - result = count_of_sparse_all_reduce_calls( - 'test_dist_mnist_dgc_nccl_tr0_err.log') - # only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3 + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + result = count_of_sparse_all_reduce_calls( + 'test_dist_mnist_dgc_nccl_tr0_err.log') + # only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3 - # temp close this test. In python3 CI, the log is right, but the result - # has a problem, may be in multi process mode, log is not writed in time. - # self.assertEqual(result, 3) + # temp close this test. In python3 CI, the log is right, but the result + # has a problem, may be in multi process mode, log is not written in time. 
+ # self.assertEqual(result, 3) class TestDistMnistNCCL2DGCMultiCards(TestDistBase): @@ -76,10 +78,12 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase): log_name=flag_name) def tearDown(self): - result = count_of_sparse_all_reduce_calls( - 'test_dist_mnist_dgc_nccl_dgc_2cards_local.log') - # same as above, but use two cards - self.assertEqual(result, 6) + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + result = count_of_sparse_all_reduce_calls( + 'test_dist_mnist_dgc_nccl_dgc_2cards_local.log') + # same as above, but use two cards + self.assertEqual(result, 6) if __name__ == "__main__": -- GitLab