diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 08e24f86a29a4cb30c6938c21cdf915806cf1bf8..0b53046d056eea505588935388b8183285be9867 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1148,7 +1148,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 574a222ba18c9fd7487e14cf549645b0d5e893fb..a1a853f006c0d83811427fd2fd74c8c31b36d5cf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -32,7 +32,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 batch_size = 100 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -147,4 +146,5 @@ def test_sharding_api(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py index b1f885e8cffe655e7937c16696dc35e16be367dc..8c07734d513c46e3f2f244d3bfbe9d1a118951e3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -224,4 +225,5 @@ def test_dp_stage2(): if __name__ == '__main__': - test_dp_stage2() + with _test_eager_guard(): + test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py index 360992a067f023758ef215de55855bf2c6e97ebf..b09314ae9e31cbfb6ceef79f4036c37567283832 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -107,4 +108,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': - test_sharding_stage2_offload() + with _test_eager_guard(): + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 705831d50f171966a81635e97a149c6d9f4ba16d..0ed9b681fdcf52887ab413545064e32bd96f2e8c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -23,6 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.utils.internal_storage import GradStorage from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 @@ -138,4 +139,6 @@ def train_mlp(): if __name__ == '__main__': + with _test_eager_guard(): + pass train_mlp() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 82edd1c17a54184b91d0c087080510a674b0cd51..58432540d1b8261ac7a48c555f76f50377f0f3a6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -42,7 +42,6 @@ strategy.hybrid_configs = { "pp_degree": 1, "sharding_degree": 1 } -fleet.init(is_collective=True, strategy=strategy) np.random.seed(seed) paddle.seed(seed) @@ -225,4 +224,5 @@ def test_dp_stage2(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index a7b16bbb75977c9fec6d2480521254e8009b6433..cd2d7b3f127654b74f90e191d21a950abfb58bdf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -36,6 +36,14 @@ epoch = 2 batch_size = 32 linear_size = 1000 +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} + np.random.seed(seed) paddle.seed(seed) @@ -109,4 +117,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 82821cd7ee644b5209a594e9a43de7636cdd4958..fc4002ef405bd20c91a51586da926dff0788d535 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -38,7 +39,6 @@ np.random.seed(2021) base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -274,4 +274,7 @@ def test_stage2_stage3(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index df7ba78d345a3c5bbd8a2cbc6de465e2546715cb..763a7a8b97fddafa7eeca2ae461686f99b918fa8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler @@ -33,7 +34,6 @@ np.random.seed(2022) base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -196,4 +196,7 @@ def test_stage3_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index e664face0483a49c283487f5f02ece25f37e9963..0a51045dee5e1550d38e56924d8e39a239b8fc43 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,9 +25,10 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): - self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False) self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py index 0be455591bf93980b48773616608e0bb40757d92..50e1985138610c1d410ddf3022c94d04d35e6b6c 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py @@ -14,7 +14,6 @@ from __future__ import print_function -import os import unittest import paddle.fluid as fluid @@ -30,5 +29,4 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus): if __name__ == "__main__": - os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main()