From ebf9797ec3049726aaea216f02c4acd92565fba9 Mon Sep 17 00:00:00 2001 From: guru4elephant <35550832+guru4elephant@users.noreply.github.com> Date: Mon, 22 Jul 2019 10:00:03 +0800 Subject: [PATCH] split different comm method for mnist distributed training (#18715) * split different comm method for mnist distributed training --- .../fluid/tests/unittests/CMakeLists.txt | 12 ++- .../tests/unittests/test_dist_fleet_ctr.py | 2 +- .../test_dist_mnist_backward_deps.py | 35 ++++++++ .../unittests/test_dist_mnist_dgc_nccl.py | 35 ++++++++ .../unittests/test_dist_mnist_hallreduce.py | 35 ++++++++ .../unittests/test_dist_mnist_multi_comm.py | 35 ++++++++ .../tests/unittests/test_dist_mnist_nccl.py | 90 ------------------- .../test_dist_mnist_ring_allreduce.py | 34 +++++++ .../unittests/test_dist_se_resnext_nccl.py | 1 + 9 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py delete mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index af1348b765e..7488f28f545 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -8,7 +8,11 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_nccl) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_ring_allreduce) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_backward_deps) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) @@ -215,7 +219,11 @@ if(WITH_DISTRIBUTE) endif() if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350) - set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 350) + set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350) + set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350) + set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350) + set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350) + set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350) set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 68ffe64ced9..5d3c0fbdd0c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -46,7 +46,7 @@ class TestDistMnist2x2(TestFleetBase): def test_dist_train(self): self.check_with_place( - "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) + "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py new file mode 100644 index 00000000000..1f6274ec164 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2BackWardDeps(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._enable_backward_deps = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py new file mode 100644 index 00000000000..529bd330ac9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2DGC(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._use_dgc = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py new file mode 100644 index 00000000000..247e4c0500f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2HAllreduce(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._use_hallreduce = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py new file mode 100644 index 00000000000..d0a21fe0dca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2MultiNCCLComm(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._nccl_comm_num = 3 + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py deleted file mode 100644 index 8718dce5ee5..00000000000 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import unittest -from test_dist_base import TestDistBase - - -class TestDistMnistNCCL2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._use_reduce = False - self._use_reader_alloc = False - self._nccl2_mode = True - - def test_dist_train(self): - import paddle.fluid as fluid - if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) - - -class TestDistMnistNCCL2MultiNCCLComm(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._use_reduce = False - self._use_reader_alloc = False - self._nccl2_mode = True - self._nccl_comm_num = 3 - - def test_dist_train(self): - import paddle.fluid as fluid - if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) - - -class TestDistMnistNCCL2DGC(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._use_reduce = False - self._use_reader_alloc = False - self._nccl2_mode = True - self._use_dgc = True - - def test_dist_train(self): - import paddle.fluid as fluid - if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) - - -class TestDistMnistNCCL2BackWardDeps(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._use_reduce = False - self._use_reader_alloc = False - self._nccl2_mode = True - self._enable_backward_deps = True - - def test_dist_train(self): - import paddle.fluid as fluid - if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) - - -class TestDistMnistNCCL2HAllreduce(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._use_reduce = False - self._use_reader_alloc = False - self._nccl2_mode = True - self._use_hallreduce = True - - def test_dist_train(self): - import paddle.fluid as fluid - if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py new file mode 100644 index 00000000000..fd15020275b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py @@ -0,0 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py index 38f7bb80d2f..3e55efb633d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py @@ -56,6 +56,7 @@ class TestDistSeResneXtNCCLMP(TestDistBase): self.check_with_place( "dist_se_resnext.py", delta=1e-5, + check_error_log=True, need_envs={"NCCL_P2P_DISABLE": "1"}) -- GitLab