From 668a0a41325b0ff2b47677b1c800cefa62b6a161 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 10 May 2022 15:36:31 +0800 Subject: [PATCH] [Eager] Refactor several sharding tests (#42608) * [Eager] fix sharding under eager mode * [Eager] fix several sharding tests under eager mode * Recover using _test_eager_guard * Ensured fleet.init under legacy * Ensured fleet.init under legacy * Fix CI issue, re-define strategy and call fleet.init() in stage2_offload * Modified dygraph_group_sharded_api.py, move fleet.init to a better line --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- .../fluid/tests/unittests/dygraph_group_sharded_api.py | 2 +- .../tests/unittests/dygraph_group_sharded_stage2.py | 4 +++- .../unittests/dygraph_group_sharded_stage2_offload.py | 4 +++- .../tests/unittests/dygraph_sharding_optimizer_stage2.py | 3 +++ .../fluid/tests/unittests/dygraph_sharding_stage2.py | 2 +- .../tests/unittests/dygraph_sharding_stage2_offload.py | 9 +++++++++ .../fluid/tests/unittests/dygraph_sharding_stage3.py | 5 ++++- .../tests/unittests/dygraph_sharding_stage3_offload.py | 5 ++++- .../tests/unittests/test_dygraph_group_sharded_api.py | 4 +++- .../unittests/test_dygraph_sharding_optimizer_stage2.py | 2 -- 11 files changed, 32 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 08e24f86a29..0b53046d056 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1148,7 +1148,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + 
set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 574a222ba18..a1a853f006c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -32,7 +32,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 batch_size = 100 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -147,4 +146,5 @@ def test_sharding_api(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py index b1f885e8cff..8c07734d513 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -224,4 +225,5 @@ def test_dp_stage2(): if __name__ == '__main__': - test_dp_stage2() + with _test_eager_guard(): + test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py 
index 360992a067f..b09314ae9e3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -107,4 +108,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': - test_sharding_stage2_offload() + with _test_eager_guard(): + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 705831d50f1..0ed9b681fdc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -23,6 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.utils.internal_storage import GradStorage from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 @@ -138,4 +139,6 @@ def train_mlp(): if __name__ == '__main__': + with _test_eager_guard(): + pass train_mlp() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 82edd1c17a5..58432540d1b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -42,7 
+42,6 @@ strategy.hybrid_configs = { "pp_degree": 1, "sharding_degree": 1 } -fleet.init(is_collective=True, strategy=strategy) np.random.seed(seed) paddle.seed(seed) @@ -225,4 +224,5 @@ def test_dp_stage2(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index a7b16bbb759..cd2d7b3f127 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -36,6 +36,14 @@ epoch = 2 batch_size = 32 linear_size = 1000 +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} + np.random.seed(seed) paddle.seed(seed) @@ -109,4 +117,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 82821cd7ee6..fc4002ef405 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -38,7 +39,6 @@ np.random.seed(2021) base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 
-fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -274,4 +274,7 @@ def test_stage2_stage3(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index df7ba78d345..763a7a8b97f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler @@ -33,7 +34,6 @@ np.random.seed(2022) base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -196,4 +196,7 @@ def test_stage3_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index e664face048..0a51045dee5 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,9 +25,10 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): - self.run_mnist_2gpu('dygraph_group_sharded_api.py') + 
self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False) self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py index 0be455591bf..50e19851386 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py @@ -14,7 +14,6 @@ from __future__ import print_function -import os import unittest import paddle.fluid as fluid @@ -30,5 +29,4 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus): if __name__ == "__main__": - os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() -- GitLab