From 438ca7f69b8f794027ceca683e27e99e259a97a3 Mon Sep 17 00:00:00 2001
From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com>
Date: Thu, 21 Jul 2022 11:55:08 +0800
Subject: [PATCH] [AutoParallel] fix unittest with paddle.distributed.launch
 (#44439)

* fix unittest

* fix log_dir

* _enable_legacy_dygraph
---
 .../paddle/distributed/auto_parallel/process_group.py |  5 ++++-
 .../auto_parallel/test_auto_parallel_relaunch.py      |  2 +-
 .../tests/unittests/auto_parallel/test_converter.py   | 10 +++++-----
 .../tests/unittests/auto_parallel/test_engine_api.py  |  4 ++--
 .../unittests/auto_parallel/test_engine_api_dp.py     |  4 ++--
 .../unittests/auto_parallel/test_high_order_grad.py   | 10 +++++-----
 .../auto_parallel/test_relaunch_with_gpt_planner.py   |  2 +-
 .../auto_parallel/test_relaunch_with_planner.py       |  2 +-
 8 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py
index 74cb6930e03..245c5c955e8 100644
--- a/python/paddle/distributed/auto_parallel/process_group.py
+++ b/python/paddle/distributed/auto_parallel/process_group.py
@@ -16,10 +16,12 @@ from collections import OrderedDict
 
 import paddle
 import paddle.fluid.core as core
+
 from ..collective import _get_global_env
 from ..collective import _new_ring_id
 from ...fluid.framework import _non_static_mode
 from ...fluid.layers.tensor import fill_constant
+from paddle.fluid.framework import _enable_legacy_dygraph
 
 
 def get_all_process_groups():
@@ -134,7 +136,8 @@ class ProcessGroup:
 
         # TODO(shenliang03): This is a temporary solution to solve the problem of
         # hang caused by cross-creation of new_group
-        paddle.framework._in_legacy_dygraph()
+        paddle.disable_static()
+        _enable_legacy_dygraph()
         paddle.set_device('gpu:%d' %
                           paddle.distributed.ParallelEnv().dev_id)
         tmp = paddle.to_tensor(
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
index 09ec5131402..f893088782d 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
@@ -126,7 +126,7 @@ class TestAutoParallelReLaunch(unittest.TestCase):
             coverage_args = []
 
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--log_dir", self.temp_dir.name,
+            "-m", "paddle.distributed.launch", "--log_dir", self.temp_dir.name,
             "--cluster_topo_path", cluster_json_path, "--rank_mapping_path",
             mapping_json_path, "--enable_auto_mapping", "True",
             launch_model_path
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
index 22abd6d7995..40fc301f261 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
 import os
 import sys
@@ -32,18 +33,17 @@ class TestConverter(unittest.TestCase):
         else:
             coverage_args = []
 
+        tmp_dir = tempfile.TemporaryDirectory()
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--gpus", "0,1", launch_model_path
+            "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir",
+            tmp_dir.name, launch_model_path
         ]
 
         process = subprocess.Popen(cmd)
         process.wait()
         self.assertEqual(process.returncode, 0)
 
-        # Remove unnecessary files
-        log_path = os.path.join(file_dir, "log")
-        if os.path.exists(log_path):
-            shutil.rmtree(log_path)
+        tmp_dir.cleanup()
 
     def test_input_invalid(self):
         with self.assertRaises(ValueError):
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
index 8d5051a3d48..3dfedea46f6 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
@@ -34,8 +34,8 @@ class TestEngineAPI(unittest.TestCase):
 
         tmp_dir = tempfile.TemporaryDirectory()
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--gpus", "0,1", "--log_dir", tmp_dir.name,
-            launch_model_path
+            "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir",
+            tmp_dir.name, launch_model_path
         ]
 
         process = subprocess.Popen(cmd)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_dp.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_dp.py
index 92c8e534aa2..3e6105917a8 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_dp.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_dp.py
@@ -34,8 +34,8 @@ class TestEngineAPI(unittest.TestCase):
 
         tmp_dir = tempfile.TemporaryDirectory()
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--gpus", "0,1", "--log_dir", tmp_dir.name,
-            launch_model_path
+            "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir",
+            tmp_dir.name, launch_model_path
         ]
 
         process = subprocess.Popen(cmd)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
index 9fb1c22d76c..104cfb59ff5 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
 import os
 import sys
@@ -31,18 +32,17 @@ class TestHighOrderGrad(unittest.TestCase):
         else:
             coverage_args = []
 
+        tmp_dir = tempfile.TemporaryDirectory()
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--gpus", "0,1", launch_model_path
+            "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir",
+            tmp_dir.name, launch_model_path
         ]
 
         process = subprocess.Popen(cmd)
         process.wait()
         self.assertEqual(process.returncode, 0)
 
-        # Remove unnecessary files
-        log_path = os.path.join(file_dir, "log")
-        if os.path.exists(log_path):
-            shutil.rmtree(log_path)
+        tmp_dir.cleanup()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
index bc1ebd6688e..dd7e02af854 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
@@ -56,7 +56,7 @@ class TestPlannerReLaunch(unittest.TestCase):
             coverage_args = []
 
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--log_dir", self.temp_dir.name,
+            "-m", "paddle.distributed.launch", "--log_dir", self.temp_dir.name,
             "--cluster_topo_path", cluster_json_path, "--rank_mapping_path",
             mapping_json_path, "--enable_auto_mapping", "True",
             launch_model_path
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
index efcc313a2a4..b9b02d749d8 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
@@ -56,7 +56,7 @@ class TestPlannerReLaunch(unittest.TestCase):
             coverage_args = []
 
         cmd = [sys.executable, "-u"] + coverage_args + [
-            "-m", "launch", "--log_dir", self.temp_dir.name,
+            "-m", "paddle.distributed.launch", "--log_dir", self.temp_dir.name,
             "--cluster_topo_path", cluster_json_path, "--rank_mapping_path",
             mapping_json_path, "--enable_auto_mapping", "True",
             launch_model_path
-- 
GitLab