Unverified · Commit 668a0a41 · Authored by Weilong Wu · Committed by GitHub

[Eager] Refactor several sharding tests (#42608)

* [Eager] Fix sharding under eager mode

* [Eager] Fix several sharding tests under eager mode

* Recover using _test_eager_guard

* Ensure fleet.init is called under legacy mode

* Ensure fleet.init is called under legacy mode

* Fix CI issue: re-define the strategy and call fleet.init() in stage2_offload

* Modified dygraph_group_sharded_api.py, moving fleet.init to a better place
Parent e2540c17
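The recurring change across the Python test scripts below is that fleet.init is no longer called at import time; each script now calls it inside its __main__ block, after probing _test_eager_guard(), so collective initialization happens per launched worker and in the mode the test actually runs in. A minimal sketch of that legacy-mode pattern, with a placeholder test body (the real files train MLP models under a 2-GPU launch), is:

```python
from paddle.distributed import fleet
from paddle.fluid.framework import _test_eager_guard


def run_sharding_case():
    # placeholder for the real sharding training / consistency checks
    pass


if __name__ == '__main__':
    # Probe the eager guard but run nothing inside it: this is the
    # legacy-dygraph variant, so the actual test stays outside the guard.
    with _test_eager_guard():
        pass
    # fleet.init now runs here, per launched worker, instead of at import time.
    fleet.init(is_collective=True)
    run_sharding_case()
```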
@@ -1148,7 +1148,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
   set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
   set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
   set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
   set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200)
   set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
   set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
@@ -32,7 +32,6 @@ base_lr = 0.1
 momentum_rate = 0.9
 l2_decay = 1e-4
 batch_size = 100
-fleet.init(is_collective=True)
 class MLP(fluid.Layer):
@@ -147,4 +146,5 @@ def test_sharding_api():
 if __name__ == '__main__':
     with _test_eager_guard():
         pass
+    fleet.init(is_collective=True)
     test_sharding_api()
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Linear
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import nn
+from paddle.fluid.framework import _test_eager_guard
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
@@ -224,4 +225,5 @@ def test_dp_stage2():
 if __name__ == '__main__':
-    test_dp_stage2()
+    with _test_eager_guard():
+        test_dp_stage2()
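For the group_sharded_* (eager) variants, the change goes the other way: _test_eager_guard is imported and the test entry point is moved inside the guard, so the whole case executes under eager mode. A minimal sketch, with the real test body stubbed out:

```python
from paddle.fluid.framework import _test_eager_guard


def test_dp_stage2():
    # stands in for the real test, which builds GroupShardedStage2 models
    # and checks their training results
    pass


if __name__ == '__main__':
    with _test_eager_guard():
        test_dp_stage2()  # the whole case now runs under eager mode
```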
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Linear
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import nn
+from paddle.fluid.framework import _test_eager_guard
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
@@ -107,4 +108,5 @@ def test_sharding_stage2_offload():
 if __name__ == '__main__':
-    test_sharding_stage2_offload()
+    with _test_eager_guard():
+        test_sharding_stage2_offload()
@@ -23,6 +23,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Linear
 from paddle.distributed import fleet
+from paddle.fluid.framework import _test_eager_guard
 from paddle.distributed.fleet.utils.internal_storage import GradStorage
 from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
@@ -138,4 +139,6 @@ def train_mlp():
 if __name__ == '__main__':
+    with _test_eager_guard():
+        pass
     train_mlp()
@@ -42,7 +42,6 @@ strategy.hybrid_configs = {
     "pp_degree": 1,
     "sharding_degree": 1
 }
-fleet.init(is_collective=True, strategy=strategy)
 np.random.seed(seed)
 paddle.seed(seed)
@@ -225,4 +224,5 @@ def test_dp_stage2():
 if __name__ == '__main__':
     with _test_eager_guard():
         pass
+    fleet.init(is_collective=True, strategy=strategy)
     test_dp_stage2()
@@ -36,6 +36,14 @@ epoch = 2
 batch_size = 32
 linear_size = 1000
+strategy = fleet.DistributedStrategy()
+strategy.hybrid_configs = {
+    "dp_degree": 2,
+    "mp_degree": 1,
+    "pp_degree": 1,
+    "sharding_degree": 1
+}
 np.random.seed(seed)
 paddle.seed(seed)
@@ -109,4 +117,5 @@ def test_sharding_stage2_offload():
 if __name__ == '__main__':
     with _test_eager_guard():
         pass
+    fleet.init(is_collective=True, strategy=strategy)
     test_sharding_stage2_offload()
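Per the commit note about the stage2_offload CI fix, this script no longer depends on a strategy object defined elsewhere at import time: it re-defines the DistributedStrategy with its hybrid_configs locally and passes it to fleet.init inside __main__. A minimal sketch of that arrangement, with the test body stubbed out:

```python
from paddle.distributed import fleet
from paddle.fluid.framework import _test_eager_guard

# Re-defined locally so the script no longer relies on import-time state;
# the degrees mirror the added diff lines (2-way data parallel only).
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": 2,
    "mp_degree": 1,
    "pp_degree": 1,
    "sharding_degree": 1
}


def test_sharding_stage2_offload():
    # placeholder for the real offload training / parameter checks
    pass


if __name__ == '__main__':
    with _test_eager_guard():
        pass  # legacy variant: nothing runs under the eager guard
    fleet.init(is_collective=True, strategy=strategy)
    test_sharding_stage2_offload()
```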
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Linear
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import nn
+from paddle.fluid.framework import _test_eager_guard
 from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
@@ -38,7 +39,6 @@ np.random.seed(2021)
 base_lr = 0.1
 momentum_rate = 0.9
 l2_decay = 1e-4
-fleet.init(is_collective=True)
 class MLP(fluid.Layer):
@@ -274,4 +274,7 @@ def test_stage2_stage3():
 if __name__ == '__main__':
+    with _test_eager_guard():
+        pass
+    fleet.init(is_collective=True)
     test_stage2_stage3()
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Linear
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import nn
+from paddle.fluid.framework import _test_eager_guard
 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
@@ -33,7 +34,6 @@ np.random.seed(2022)
 base_lr = 0.1
 momentum_rate = 0.9
 l2_decay = 1e-4
-fleet.init(is_collective=True)
 class MLP(fluid.Layer):
@@ -196,4 +196,7 @@ def test_stage3_offload():
 if __name__ == '__main__':
+    with _test_eager_guard():
+        pass
+    fleet.init(is_collective=True)
     test_stage3_offload()
@@ -14,6 +14,7 @@
 from __future__ import print_function
+import os
 import unittest
 import paddle.fluid as fluid
@@ -24,9 +25,10 @@ class TestDygraphGroupSharded(TestMultipleGpus):
     # check group sharded logic as well as the accuracy with single mode
     def test_dygraph_group_sharded(self):
-        self.run_mnist_2gpu('dygraph_group_sharded_api.py')
+        self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False)
         self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')
 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
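The runner now exercises both execution modes explicitly: FLAGS_enable_eager_mode=1 is exported so the default launch path runs the eager script under eager mode, while the legacy script is launched with eager_mode=False. A sketch of the resulting runner, assuming the TestMultipleGpus helper comes from test_parallel_dygraph_dataparallel as in the neighbouring runners (that import is not shown in this diff):

```python
import os
import unittest

# Assumption: the multi-GPU helper base class lives in
# test_parallel_dygraph_dataparallel, as for the other 2-GPU runners.
from test_parallel_dygraph_dataparallel import TestMultipleGpus


class TestDygraphGroupSharded(TestMultipleGpus):

    # check group sharded logic as well as the accuracy with single mode
    def test_dygraph_group_sharded(self):
        self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False)
        self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')


if __name__ == "__main__":
    os.environ["FLAGS_enable_eager_mode"] = "1"
    unittest.main()
```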
@@ -14,7 +14,6 @@
 from __future__ import print_function
-import os
 import unittest
 import paddle.fluid as fluid
@@ -30,5 +29,4 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus):
 if __name__ == "__main__":
-    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()