diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a1cf5fad138f068c9eac5fe8d681c9f08b192270..b61ef706ba2460a12c4fe659984917b627e20906 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) @@ -95,11 +96,13 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) - py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) + set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000) # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_transformer MODULES test_dist_transformer) - # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) + # py_test_modules(test_dist_transformer MODULES test_dist_transformer) + # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) 
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index d98b839e9b3fe10cb5b79c672284f8dfb6fbf141..969f5cb63c9dd2a773be9530abd2a49714202cd1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -110,7 +110,8 @@ class TestDistRunnerBase(object): trainer_prog = fluid.default_main_program() if args.use_cuda: - place = fluid.CUDAPlace(0) + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) else: place = fluid.CPUPlace() @@ -256,6 +257,7 @@ class TestDistBase(unittest.TestCase): self._dc_asgd = False # must use with async mode self._use_reader_alloc = True self._nccl2_mode = False + self._mp_mode = False # FIXME(typhoonzero): I added this stupid argument to enable # testing allreduce layers, which users can call layers.allreduce # to accumulate tensors at anywhere. Find a better way to do this @@ -504,6 +506,10 @@ class TestDistBase(unittest.TestCase): env0 = {'CPU_NUM': '1'} env1 = {'CPU_NUM': '1'} + if self._mp_mode: + env0 = {"FLAGS_selected_gpus": "0"} + env1 = {"FLAGS_selected_gpus": "1"} + env0.update(envs) env1.update(envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py new file mode 100644 index 0000000000000000000000000000000000000000..38f7bb80d2f9144800ef8f8fb1402dcf86925067 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py @@ -0,0 +1,63 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import os
+
+
+def skip_ci(func):
+    """Mark ``func`` as skipped when ``SKIP_UNSTABLE_CI`` is set to non-zero.
+
+    The environment variable is read once, at the moment the decorator is
+    applied, and parsed with ``int()``; a value that is not an integer
+    literal therefore raises ``ValueError`` at import time.
+
+    Args:
+        func: the test method to guard.
+
+    Returns:
+        ``func`` itself when the variable is unset or ``0``; otherwise a
+        wrapper that reports the test as *skipped* instead of silently
+        returning (which the runner would count as a pass).
+    """
+    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
+    # unittest.skipIf keeps the wrapped test's name/docstring and makes the
+    # skip visible in the test report.
+    return unittest.skipIf(on_ci, "SKIP_UNSTABLE_CI is set")(func)
+
+
+class TestDistSeResneXtNCCL(TestDistBase):
+    """Runs dist_se_resnext.py through TestDistBase with NCCL2 mode on."""
+
+    def _setup_config(self):
+        # Synchronous updates, reader allocation off, NCCL2 mode on.
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+
+    @skip_ci
+    def test_dist_train(self):
+        # Imported lazily so that merely importing this module does not
+        # require paddle.
+        import paddle.fluid as fluid
+        if not fluid.core.is_compiled_with_cuda():
+            # Report a skip rather than a silent pass on CPU-only builds.
+            self.skipTest("PaddlePaddle is not compiled with CUDA")
+        self.check_with_place("dist_se_resnext.py", delta=1e-5)
+
+
+class TestDistSeResneXtNCCLMP(TestDistBase):
+    """Same check as TestDistSeResneXtNCCL, additionally enabling _mp_mode.
+
+    The test passes NCCL_P2P_DISABLE=1 to the launched trainers via
+    ``need_envs``.
+    """
+
+    def _setup_config(self):
+        # Same settings as the plain NCCL2 case, plus the _mp_mode flag.
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._mp_mode = True
+
+    @skip_ci
+    def test_dist_train(self):
+        # Imported lazily so that merely importing this module does not
+        # require paddle.
+        import paddle.fluid as fluid
+        if not fluid.core.is_compiled_with_cuda():
+            # Report a skip rather than a silent pass on CPU-only builds.
+            self.skipTest("PaddlePaddle is not compiled with CUDA")
+        self.check_with_place(
+            "dist_se_resnext.py",
+            delta=1e-5,
+            need_envs={"NCCL_P2P_DISABLE": "1"})
+
+
+if __name__ == "__main__":
+    unittest.main()