Unverified commit 1773afd7, authored by zhaoyingli, committed by GitHub

[AutoParallel] Add AutoConvert (#36958)

* add AutoConvert

* add unit test

* amend merge & slice

* amend default dist_attr

* update doc & improve coverage

* add interface dist_context

* tiny modifications
Parent c1310343
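
In brief, the checkpoint-conversion workflow this commit adds (a minimal sketch assembled only from the utilities exercised by the new test below; ckpt_path and dist_attr_path are per-rank file lists, and dist_main_prog / dist_main_prog_load are placeholder distributed programs built under the old and new parallel strategies):

from paddle.distributed.auto_parallel.utils import (
    save_distributed_checkpoint, load_distributed_checkpoint,
    load_checkpoint_into_program, get_dist_attr,
    merge_and_slice_parameter, load_parameter_into_program)

# While training under the old strategy, every rank saves its parameter
# shards together with their distributed attributes.
save_distributed_checkpoint(dist_main_prog, ".", dist_attr_path=".")

# One-step restore: the checkpoint is converted automatically to fit a
# program built under the new strategy.
load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog_load)

# Equivalent step-by-step form: merge the old parameter slices, re-slice
# them for the new distributed attributes, and load them into the program.
param_dict, pre_dist_attr, addition_info = load_distributed_checkpoint(
    ckpt_path, dist_attr_path)
cur_dist_attr = get_dist_attr(dist_main_prog_load)
sliced_param_dict = merge_and_slice_parameter(param_dict, pre_dist_attr,
                                              cur_dist_attr)
load_parameter_into_program(sliced_param_dict, dist_main_prog_load)
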
@@ -39,6 +39,7 @@ list(APPEND DIST_TEST_OPS test_parallel_class_center_sample)
list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy)
list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard)
list(APPEND DIST_TEST_OPS test_auto_parallel_save_load)
+list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -256,6 +257,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load)
+LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert)
elseif(WITH_GPU)
if (${CUDNN_VERSION} VERSION_LESS 7100)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
@@ -1035,6 +1037,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120)
set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120)
set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120)
+set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120)
if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import random
import numpy as np
import os
import shutil
import paddle
import paddle.nn as nn
import paddle.utils as utils
import paddle.static as static
import paddle.nn.functional as F
import paddle.distributed.auto_parallel as auto
from paddle.distributed import fleet
from paddle.fluid.initializer import NumpyArrayInitializer
from paddle.distributed.auto_parallel.utils import save_distributed_checkpoint, load_distributed_checkpoint, load_checkpoint_into_program
from paddle.distributed.auto_parallel.utils import get_dist_attr, merge_and_slice_parameter, load_parameter_into_program
from paddle.distributed.auto_parallel.reshard import HAS_SENT, HAS_RECV, HAS_ALLGATHER
paddle.enable_static()

_global_parallel_strategy = None
_global_process_mesh = None
PP_MESH_0 = None
PP_MESH_1 = None


class MLPLayer(nn.Layer):
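    # A two-layer MLP whose weights are initialized from fixed numpy arrays so
    # that runs under different parallel strategies stay numerically comparable.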
def __init__(self,
hidden_size=64,
intermediate_size=4 * 64,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
np.random.seed(2021)
arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward))
        arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model))
weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0))
weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1))
bias_attr = None
self.linear0 = nn.Linear(
d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
self.linear1 = nn.Linear(
dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)

    def forward(self, input):
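        # Annotate the weights with a process mesh and dims_mapping that match
        # the currently selected global parallel strategy.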
if _global_parallel_strategy == "pp":
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": PP_MESH_0,
"dims_mapping": [-1, -1]
})
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": PP_MESH_1,
"dims_mapping": [-1, -1]
})
elif _global_parallel_strategy == "mp":
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [-1, 0]
})
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [0, -1]
})
elif _global_parallel_strategy == "dp":
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [-1, -1]
})
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [-1, -1]
})
out = self.norm(input)
out = self.linear0(out)
out = F.gelu(out, approximate=True)
out = self.linear1(out)
return out


def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
utils.unique_name.guard():
batch_size = 4
hidden_size = 64
input = static.data(
name="input", shape=[batch_size, hidden_size], dtype='float32')
label = static.data(
name="label", shape=[batch_size, 1], dtype='float32')
if _global_parallel_strategy == "pp":
auto.shard_tensor(
input,
dist_attr={
"process_mesh": PP_MESH_0,
"dims_mapping": [-1, -1]
})
auto.shard_tensor(
label,
dist_attr={
"process_mesh": PP_MESH_1,
"dims_mapping": [-1, -1]
})
elif _global_parallel_strategy == "dp":
auto.shard_tensor(
input,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [0, -1]
})
elif _global_parallel_strategy == "mp":
auto.shard_tensor(
input,
dist_attr={
"process_mesh": _global_process_mesh,
"dims_mapping": [-1, -1]
})
mlp = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
initializer_range=0.02)
predict = mlp(input)
error_cost = paddle.nn.functional.square_error_cost(predict, label)
loss = paddle.mean(error_cost)
return loss, train_program, start_program


def get_distributed_program():
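    # Build the distributed main/startup programs for the current global
    # strategy: fleet is initialized in semi-auto mode, and the partitioned
    # programs are produced by the distributed optimizer's minimize().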
train_program = static.Program()
startup_program = static.Program()
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
fleet.init(is_collective=True, strategy=dist_strategy)
loss, train_program, startup_program = mlp_forward(train_program,
startup_program)
optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer)
_, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
loss, startup_program)
return dist_main_prog, dist_startup_prog, loss


class TestMLPAutoConvert(unittest.TestCase):
def setUp(self):
paddle.seed(2021)
random.seed(2021)
np.random.seed(2021)

    def tearDown(self):
os.remove("./model_state_rank{}.pdmodel".format(
str(paddle.distributed.get_rank())))
os.remove("./dist_attr_rank{}.pdattr".format(
str(paddle.distributed.get_rank())))

    def test_mlp_mp2pp(self):
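        # Train under "mp" for 20 steps, checkpoint at step 10, then rebuild
        # the programs under "pp" and resume from the converted checkpoint.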
global _global_parallel_strategy
_global_parallel_strategy = "mp"
global _global_process_mesh
_global_process_mesh = auto.ProcessMesh([0, 1])
input = np.random.random(size=(80, 64)).astype('float32')
label = np.random.random(size=(80, 1)).astype('float32')
dist_main_prog, dist_start_prog, loss = get_distributed_program()
place = paddle.set_device("gpu")
exe = paddle.static.Executor(place)
exe.run(dist_start_prog)
for step in range(20):
if step == 10:
save_distributed_checkpoint(
dist_main_prog, ".", dist_attr_path=".")
res = exe.run(dist_main_prog,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
},
fetch_list=[loss])
last_res = res[0]
_global_parallel_strategy = "pp"
_global_process_mesh = auto.ProcessMesh([0, 1])
global PP_MESH_0
PP_MESH_0 = auto.ProcessMesh(mesh=[0])
global PP_MESH_1
PP_MESH_1 = auto.ProcessMesh(mesh=[1])
dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
)
place = paddle.set_device("gpu")
exe = paddle.static.Executor(place)
exe.run(dist_start_prog_load)
ckpt_path = [
"./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
]
dist_attr_path = [
"./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
]
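        # Load the "mp" checkpoint into the "pp" program; the parameters are
        # merged and re-sliced automatically to fit the new dist_attr.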
load_checkpoint_into_program(ckpt_path, dist_attr_path,
dist_main_prog_load)
for step in range(10, 20):
if paddle.distributed.get_rank() in [0]:
res = exe.run(dist_main_prog_load,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
})
else:
res = exe.run(dist_main_prog_load,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
},
fetch_list=[loss_load])
if paddle.distributed.get_rank() in [1]:
self.assertEqual(last_res, res[0])


class TestMLPAutoConvert2(unittest.TestCase):
def setUp(self):
paddle.seed(2021)
random.seed(2021)
np.random.seed(2021)
HAS_SENT.clear()
HAS_RECV.clear()
HAS_ALLGATHER.clear()

    def tearDown(self):
os.remove("./model_state_rank{}.pdmodel".format(
str(paddle.distributed.get_rank())))
os.remove("./dist_attr_rank{}.pdattr".format(
str(paddle.distributed.get_rank())))

    def test_mlp_pp2mp(self):
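        # Train under "pp" for 20 steps, checkpoint at step 10 together with
        # addition_info, then resume under "mp" via the step-by-step API.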
global _global_parallel_strategy
_global_parallel_strategy = "pp"
global _global_process_mesh
_global_process_mesh = auto.ProcessMesh([0, 1])
global PP_MESH_0
PP_MESH_0 = auto.ProcessMesh(mesh=[0])
global PP_MESH_1
PP_MESH_1 = auto.ProcessMesh(mesh=[1])
input = np.random.random(size=(80, 64)).astype('float32')
label = np.random.random(size=(80, 1)).astype('float32')
dist_main_prog, dist_start_prog, loss = get_distributed_program()
place = paddle.set_device("gpu")
exe = paddle.static.Executor(place)
exe.run(dist_start_prog)
for step in range(20):
if step == 10:
add_info = {"batch": step, "batch_size": 4}
save_distributed_checkpoint(dist_main_prog, ".", ".", add_info)
if paddle.distributed.get_rank() in [0]:
res = exe.run(dist_main_prog,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
})
else:
res = exe.run(dist_main_prog,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
},
fetch_list=[loss])
if paddle.distributed.get_rank() in [1]:
last_res = res[0]
_global_parallel_strategy = "mp"
_global_process_mesh = auto.ProcessMesh([0, 1])
dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
)
place = paddle.set_device("gpu")
exe = paddle.static.Executor(place)
exe.run(dist_start_prog_load)
ckpt_path = [
"./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
]
dist_attr_path = [
"./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
]
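        # Step-by-step conversion: load the raw per-rank checkpoint, merge the
        # "pp" parameter slices, and re-slice them for the "mp" dist_attr.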
param_dict, pre_dist_attr, add_info = load_distributed_checkpoint(
ckpt_path, dist_attr_path)
batch = add_info["batch"]
batch_size = add_info["batch_size"]
start_index = batch * batch_size
input = input[start_index:, :]
label = label[start_index:, :]
cur_dist_attr = get_dist_attr(dist_main_prog_load)
sliced_param_dict = merge_and_slice_parameter(param_dict, pre_dist_attr,
cur_dist_attr)
load_parameter_into_program(sliced_param_dict, dist_main_prog_load)
for step in range(10):
res = exe.run(dist_main_prog_load,
feed={
"input": input[step * 4:(step + 1) * 4, :],
"label": label[step * 4:(step + 1) * 4, :]
},
fetch_list=[loss_load])
if paddle.distributed.get_rank() in [1]:
self.assertEqual(last_res, res[0])


class TestMLPAutoConvertInvalid(unittest.TestCase):
def setUp(self):
paddle.seed(2021)
random.seed(2021)
np.random.seed(2021)

    def test_input_invalid(self):
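        # Each call below passes an argument of the wrong type or value to the
        # save/load utilities and is expected to raise.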
global _global_parallel_strategy
_global_parallel_strategy = "mp"
global _global_process_mesh
_global_process_mesh = auto.ProcessMesh([0, 1])
dist_main_prog, _, _ = get_distributed_program()
with self.assertRaises(TypeError):
save_distributed_checkpoint(
dist_main_prog, [""], [""], addition_info=[0])
with self.assertRaises(ValueError):
save_distributed_checkpoint(
dist_main_prog, [""], [""], addition_info={"step": 0})
with self.assertRaises(ValueError):
save_distributed_checkpoint(
dist_main_prog, [""], [""], addition_info={"batch": 0.0})
with self.assertRaises(ValueError):
load_checkpoint_into_program(["./model_state_rank.pdmodel"],
["./dist_attr_rank.pdattr"],
dist_main_prog)
with self.assertRaises(ValueError):
load_distributed_checkpoint(["./model_state_rank.pdmodel"],
["./dist_attr_rank.pdattr"])
with self.assertRaises(TypeError):
load_distributed_checkpoint({
"0": "./model_state_rank.pdmodel"
}, {"1": "./dist_attr_rank.pdattr"})


if __name__ == "__main__":
unittest.main()
@@ -29,12 +29,7 @@ import paddle.distributed.auto_parallel as auto
from paddle.distributed import fleet
from paddle.fluid.initializer import NumpyArrayInitializer
from paddle.distributed.auto_parallel.utils import make_data_unshard
-from paddle.distributed.auto_parallel.utils import save_distributed_checkpoint, load_distributed_checkpoint
-from paddle.distributed.auto_parallel.reshard import reshard
-from paddle.distributed.auto_parallel.partitioner import Partitioner
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.process_group import get_all_process_groups
+from paddle.distributed.auto_parallel.utils import save_distributed_checkpoint, load_checkpoint_into_program
paddle.enable_static()
_global_parallel_strategy = None
@@ -204,7 +199,7 @@ class TestMLPSaveLoad(unittest.TestCase):
if step == 10:
path = "./output_dp{}".format(paddle.distributed.get_rank())
os.makedirs(path, exist_ok=True)
-                save_distributed_checkpoint(dist_main_prog, path)
+                save_distributed_checkpoint(dist_main_prog, path, path)
res = exe.run(dist_main_prog,
feed={
@@ -218,7 +213,11 @@ class TestMLPSaveLoad(unittest.TestCase):
"./output_dp0/model_state_rank0.pdmodel",
"./output_dp1/model_state_rank1.pdmodel"
]
-        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        dist_attr_path = [
+            "./output_dp0/dist_attr_rank0.pdattr",
+            "./output_dp1/dist_attr_rank1.pdattr"
+        ]
+        load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog)
for step in range(10, 20):
res = exe.run(dist_main_prog,
feed={
@@ -248,7 +247,7 @@ class TestMLPSaveLoad(unittest.TestCase):
if step == 10:
path = "./output_mp{}".format(paddle.distributed.get_rank())
os.makedirs(path, exist_ok=True)
-                save_distributed_checkpoint(dist_main_prog, path)
+                save_distributed_checkpoint(dist_main_prog, path, path)
res = exe.run(dist_main_prog,
feed={
@@ -262,7 +261,11 @@ class TestMLPSaveLoad(unittest.TestCase):
"./output_mp0/model_state_rank0.pdmodel",
"./output_mp1/model_state_rank1.pdmodel"
]
-        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        dist_attr_path = [
+            "./output_mp0/dist_attr_rank0.pdattr",
+            "./output_mp1/dist_attr_rank1.pdattr"
+        ]
+        load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog)
for step in range(10, 20):
res = exe.run(dist_main_prog,
feed={
@@ -296,7 +299,7 @@ class TestMLPSaveLoad(unittest.TestCase):
if step == 10:
path = "./output_pp{}".format(paddle.distributed.get_rank())
os.makedirs(path, exist_ok=True)
-                save_distributed_checkpoint(dist_main_prog, path)
+                save_distributed_checkpoint(dist_main_prog, path, path)
if paddle.distributed.get_rank() in [0]:
res = exe.run(dist_main_prog,
@@ -319,7 +322,11 @@ class TestMLPSaveLoad(unittest.TestCase):
"./output_pp0/model_state_rank0.pdmodel",
"./output_pp1/model_state_rank1.pdmodel"
]
-        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        dist_attr_path = [
+            "./output_pp0/dist_attr_rank0.pdattr",
+            "./output_pp1/dist_attr_rank1.pdattr"
+        ]
+        load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog)
for step in range(10, 20):
if paddle.distributed.get_rank() in [0]:
res = exe.run(dist_main_prog,
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus


class TestAutoParallelAutoConvert(TestMultipleGpus):
def test_auto_parallel_autoconvert(self):
self.run_mnist_2gpu('auto_parallel_autoconvert.py')


if __name__ == "__main__":
unittest.main()