Unverified commit 668a0a41, authored by Weilong Wu, committed by GitHub

[Eager] Refactor several sharding tests (#42608)

* [Eager] fix sharding under eager mode

* [Eager] fix several sharding test under eager mode

* Recover using _test_eager_guard

* Ensured fleet.init under legacy

* Ensured fleet.init under legacy

* Fix CI issue: re-define strategy and call fleet.init() in stage2_offload

* Modified dygraph_group_sharded_api.py, moved fleet.init to a better place in the file
Parent e2540c17
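The common thread in these refactors: fleet.init() used to run at module import time, which set up the process group before each test decided between legacy and eager dygraph mode; it is now called inside __main__, after that decision is made. Below is a minimal sketch of the resulting test layout, assuming a typical 2-GPU sharding test; test_sharding and the strategy values are placeholders illustrating the pattern, not any one specific test file.

from paddle.distributed import fleet
from paddle.fluid.framework import _test_eager_guard

# Module-level constants and the DistributedStrategy stay at import time ...
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": 2,
    "mp_degree": 1,
    "pp_degree": 1,
    "sharding_degree": 1
}

def test_sharding():
    # Placeholder for the real test body (build model, optimizer, run a few steps).
    pass

if __name__ == '__main__':
    with _test_eager_guard():
        pass
    # ... but fleet.init now runs here, after the eager/legacy mode is settled,
    # instead of at module import time.
    fleet.init(is_collective=True, strategy=strategy)
    test_sharding()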
@@ -1148,7 +1148,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120)
set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120)
set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200)
set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
@@ -32,7 +32,6 @@ base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
batch_size = 100
fleet.init(is_collective=True)
class MLP(fluid.Layer):
@@ -147,4 +146,5 @@ def test_sharding_api():
if __name__ == '__main__':
with _test_eager_guard():
pass
fleet.init(is_collective=True)
test_sharding_api()
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
@@ -224,4 +225,5 @@ def test_dp_stage2():
if __name__ == '__main__':
test_dp_stage2()
with _test_eager_guard():
test_dp_stage2()
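For the GroupSharded stage2 tests above, the entry point now runs the test twice: once in the default (legacy) dygraph mode and once with eager mode enabled through _test_eager_guard. A minimal sketch of that pattern, with test_dp_stage2 standing in for the actual test body:

from paddle.fluid.framework import _test_eager_guard

if __name__ == '__main__':
    # First pass: legacy dygraph mode.
    test_dp_stage2()
    # Second pass: the same test with eager mode enabled by the guard.
    with _test_eager_guard():
        test_dp_stage2()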
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
@@ -107,4 +108,5 @@ def test_sharding_stage2_offload():
if __name__ == '__main__':
test_sharding_stage2_offload()
with _test_eager_guard():
test_sharding_stage2_offload()
@@ -23,6 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.utils.internal_storage import GradStorage
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
@@ -138,4 +139,6 @@ def train_mlp():
if __name__ == '__main__':
with _test_eager_guard():
pass
train_mlp()
@@ -42,7 +42,6 @@ strategy.hybrid_configs = {
"pp_degree": 1,
"sharding_degree": 1
}
fleet.init(is_collective=True, strategy=strategy)
np.random.seed(seed)
paddle.seed(seed)
@@ -225,4 +224,5 @@ def test_dp_stage2():
if __name__ == '__main__':
with _test_eager_guard():
pass
fleet.init(is_collective=True, strategy=strategy)
test_dp_stage2()
@@ -36,6 +36,14 @@ epoch = 2
batch_size = 32
linear_size = 1000
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 2,
"mp_degree": 1,
"pp_degree": 1,
"sharding_degree": 1
}
np.random.seed(seed)
paddle.seed(seed)
@@ -109,4 +117,5 @@ def test_sharding_stage2_offload():
if __name__ == '__main__':
with _test_eager_guard():
pass
fleet.init(is_collective=True, strategy=strategy)
test_sharding_stage2_offload()
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
@@ -38,7 +39,6 @@ np.random.seed(2021)
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
fleet.init(is_collective=True)
class MLP(fluid.Layer):
@@ -274,4 +274,7 @@ def test_stage2_stage3():
if __name__ == '__main__':
with _test_eager_guard():
pass
fleet.init(is_collective=True)
test_stage2_stage3()
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
@@ -33,7 +34,6 @@ np.random.seed(2022)
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
fleet.init(is_collective=True)
class MLP(fluid.Layer):
@@ -196,4 +196,7 @@ def test_stage3_offload():
if __name__ == '__main__':
with _test_eager_guard():
pass
fleet.init(is_collective=True)
test_stage3_offload()
@@ -14,6 +14,7 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
@@ -24,9 +25,10 @@ class TestDygraphGroupSharded(TestMultipleGpus):
# check group sharded logic as well as the accuracy with single mode
def test_dygraph_group_sharded(self):
self.run_mnist_2gpu('dygraph_group_sharded_api.py')
self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False)
self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()
@@ -14,7 +14,6 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
@@ -30,5 +29,4 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus):
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()