From c9cd47d96b2cccb34d8dc269a055f5b64346a10e Mon Sep 17 00:00:00 2001
From: JZ-LIANG
Date: Wed, 2 Mar 2022 15:58:57 +0800
Subject: [PATCH] [Auto Parallel] Adapt Partitioner & DistOp for ERNIE3.0 Inference and cache (#39895)

* adapt dist op
* add dist_fill_constant_batch_size_like
* remove print
* update compatibility
* add unit test
---
 .../auto_parallel/operators/__init__.py | 1 +
 .../auto_parallel/operators/dist_eltwise.py | 0
 .../auto_parallel/operators/dist_embedding.py | 5 +-
 .../dist_fill_constant_batch_size_like.py | 127 ++++++++++++++++++
 .../auto_parallel/operators/dist_matmul.py | 8 +-
 .../distributed/auto_parallel/partitioner.py | 3 +
 .../test_auto_parallel_while_op.py | 28 ++++
 7 files changed, 168 insertions(+), 4 deletions(-)
 mode change 100755 => 100644 python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
 create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py

diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py
index 9f84df2d896..db6f909f8ca 100644
--- a/python/paddle/distributed/auto_parallel/operators/__init__.py
+++ b/python/paddle/distributed/auto_parallel/operators/__init__.py
@@ -27,3 +27,4 @@ from . import dist_eltwise
 from . import dist_check_finite_and_unscale
 from . import dist_update_loss_scaling
 from . import dist_split
+from . import dist_fill_constant_batch_size_like
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
old mode 100755
new mode 100644
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
index 94eb0d2d469..32f8e2acef5 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
@@ -155,7 +155,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
             kwargs['Out'])

         Ids_var = main_block.var(kwargs['Ids'][0])
-        Weight_var = main_block.var(kwargs['W'][0])
+        Weight_var = main_block._var_recursive(kwargs['W'][0])
         Out_var = main_block.var(kwargs['Out'][0])

         # got dist attribute info
@@ -277,7 +277,8 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):

         # param initialization sync
         if Weight_var.is_parameter and not op_dist_attr.is_recompute:
-            assert Weight_var.name not in dist_op_context.already_init_sync_vars
+            if Weight_var.name in dist_op_context.already_init_sync_vars:
+                return
             dist_op_context.already_init_sync_vars.add(Weight_var.name)
             param = startup_block.var(Weight_var.name)
             param_dist_attr = ctx.get_tensor_dist_attr_for_program(param)
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
new file mode 100644
index 00000000000..0c9d9eda02e
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+from .common import DistributedOperatorImplContainer
+from .common import DistributedOperatorImpl
+from .common import register_distributed_operator_impl_container
+from .common import register_distributed_operator_impl
+from ..utils import is_dim_shard
+from ..utils import is_dim_replicate
+from ..utils import is_valid_list_index
+from ..utils import compute_compatible_dim_mapping
+from ..utils import compute_compatible_dims_mapping
+from ..utils import compute_compatible_and_update_dim_mapping
+from ..utils import set_dist_op_desc_original_id
+from paddle.fluid import core, unique_name
+from paddle.fluid.framework import in_dygraph_mode
+from paddle.fluid.framework import Program, Parameter, Variable, program_guard
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
+from .dist_default import DistributedDefaultImpl0
+
+
+class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer):
+    def __init__(self, op_type):
+        super(DistributedFillConstantBatchSizeLike, self).__init__(op_type)
+
+
+register_distributed_operator_impl_container(
+    DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like"))
+
+
+class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl):
+    def __init__(self, name):
+        super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name)
+        self._forward_implemented = True
+        self._backward_implemented = True
+
+    def is_input_compatible(self, dist_op):
+
+        return True
+
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        out_name = op_desc.output('Out')[0]
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        shape_list = op_desc.attr("shape")
+
+        if len(shape_list) != len(out_dims_mapping):
+            return False
+
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)):
+            return False
+
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        out_name = op_desc.output('Out')[0]
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        in_name = op_desc.input('Input')[0]
+        in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name)
+
+        # the dim_mapping of batch dimension should be the same
+        return out_dims_mapping[0] == in_dims_mapping[0]
+
+    def update_dims_mapping(self, dist_op):
+        changed = False
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        x_name = op_desc.input('Input')[0]
+        out_name = op_desc.output('Out')[0]
+        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+
+        # only the batch size dimension of input and output is related
+        dim_changed = compute_compatible_and_update_dim_mapping(
+            [x_dims_mapping, out_dims_mapping], [0, 0])
+        if dim_changed:
+            changed = True
+
+        return changed
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        """
+        kwargs: inputname_mapping & outputname_mapping
+        """
+        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
+        dist_op_context = ctx.dist_op_context
+        src_op = dist_op_context.cur_src_op
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
+        main_block = dist_op_context.work_block
+        op = main_block.ops[-1]
+        assert op.type == "fill_constant_batch_size_like"
+
+        # modify the shape attr according to how the output is partitioned
+        out_name = op.output('Out')[0]
+        dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        shape_list = op.attr("shape")
+        # modify target shape
+        for idx, axis in enumerate(dims_mapping):
+            if axis >= 0:
+                shape_list[idx] = shape_list[idx] // process_mesh_shape[axis]
+
+        op._set_attr("shape", shape_list)
+        main_block._sync_with_cpp()
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
+
+
+register_distributed_operator_impl(
+    "fill_constant_batch_size_like",
+    DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape"))
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
index 9eb24a65e60..058ae1d0a9f 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
@@ -433,8 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):


 def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id):
-    assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format(
-        Weight_var.name, dist_op_context.already_init_sync_vars)
+    if Weight_var.name in dist_op_context.already_init_sync_vars:
+        return
     assert startup_block.has_var(Weight_var.name)
     dist_op_context.already_init_sync_vars.add(Weight_var.name)
     param = startup_block.var(Weight_var.name)
@@ -819,6 +819,8 @@ class DistributedMatmulImpl1(DistributedOperatorImpl):
                                 out_var_dist_attr)

         intermediate_var_0 = main_block.create_var(
+            name=unique_name.generate_with_ignorable_key(".".join(
+                ["c_allreduce_sum", 'tmp'])),
             shape=Out_var.shape,
             dtype=Out_var.dtype,
             type=Out_var.type,
@@ -1323,6 +1325,8 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl):
                                 out_var_dist_attr)

         intermediate_var_0 = main_block.create_var(
+            name=unique_name.generate_with_ignorable_key(".".join(
+                ["c_allreduce_sum", 'tmp'])),
             shape=Out_var.shape,
             dtype=Out_var.dtype,
             type=Out_var.type,
diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py
index 2f88407c093..ed5ec85d84f 100644
--- a/python/paddle/distributed/auto_parallel/partitioner.py
+++ b/python/paddle/distributed/auto_parallel/partitioner.py
@@ -285,6 +285,9 @@ def _get_dist_shape(var, dist_attr):
     var_shape = var.shape
     mapping = dist_attr.dims_mapping
     mesh = dist_attr.process_mesh.topology
+    if mapping == []:
+        return var_shape
+
     assert len(var_shape) == len(
         mapping
     ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format(
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py
index 1cd8f8f3e70..07e6a2c4346 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py
@@ -174,6 +174,7 @@ def get_program():
         dtype='float32')
     label = static.data(
         name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+    data_holder = [input, label]

     # dataloader
     dataloader = paddle.io.DataLoader.from_generator(
@@ -194,6 +195,17 @@ def get_program():
             "dims_mapping": [-1, -1, -1]
         })

+    # fill constant bsz like
+    tmp = paddle.fluid.layers.fill_constant_batch_size_like(
+        input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
+    auto.shard_tensor(
+        tmp,
+        dist_attr={
+            "process_mesh": _g_process_mesh,
+            "dims_mapping": [-1, 0, -1, -1]
+        })
+
+    # model
     mlp_start = MLPLayer(
         hidden_size=hidden_size,
         intermediate_size=4 * hidden_size,
@@ -395,6 +407,9 @@ def completion(train_program, start_program, dist_context):
                 op_dist_attr.impl_idx = 0
             else:
                 op_dist_attr.impl_idx = 1
+        elif op.type == "fill_constant_batch_size_like":
+            op_dist_attr.impl_type = "fill_constant_batch_size_like"
+            op_dist_attr.impl_idx = 0
         else:
             op_dist_attr.impl_type = "default"
             op_dist_attr.impl_idx = 0
@@ -428,6 +443,12 @@ class TestMLP(unittest.TestCase):
         dist_main_prog, dist_startup_prog = partition(
             train_program, start_program, dist_context)
         global_block_ops = dist_main_prog.blocks[0].ops
+
+        fill_op = None
+        for op in global_block_ops:
+            if op.type == "fill_constant_batch_size_like":
+                fill_op = op
+
         global_block_ops = [op.type for op in global_block_ops]
         sub_block_ops = dist_main_prog.blocks[1].ops
         sub_block_ops = [op.type for op in sub_block_ops]
@@ -435,6 +456,13 @@ class TestMLP(unittest.TestCase):
         self.assertTrue("c_allreduce_sum" in global_block_ops)
         self.assertTrue("c_allreduce_sum" in sub_block_ops)

+        # test fill_constant_batch_size_like
+
+        self.assertTrue(fill_op is not None)
+        ref_shape = [-1, 8, 0, 48]
+        shape = fill_op.attr("shape")
+        self.assertTrue(ref_shape == shape)
+

 if __name__ == "__main__":
     unittest.main()
--
GitLab
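For reference, the heart of the new dist op is the shape rescaling done in
DistributedFillConstantBatchSizeLikeImpl0.forward: every sharded dimension of the
fill_constant_batch_size_like output keeps only its local extent on each rank. The
snippet below is a minimal standalone sketch of that arithmetic, not code from the
patch; the helper name is hypothetical and the mesh/shape values merely mirror the
unit test above (a 2-process mesh with the output sharded on mesh axis 0).

    # Standalone sketch (assumed helper name, not a Paddle API): rescale the
    # `shape` attribute the way the dist op impl does in forward().
    def shard_fill_constant_shape(shape, dims_mapping, process_mesh_topology):
        # A dimension mapped to mesh axis `axis` keeps only its local slice;
        # -1 (batch size) and unsharded dimensions pass through unchanged.
        local_shape = list(shape)
        for idx, axis in enumerate(dims_mapping):
            if axis >= 0:
                local_shape[idx] = local_shape[idx] // process_mesh_topology[axis]
        return local_shape

    # Mirrors the unit test: [-1, 16, 0, 48] sharded on a 2-process mesh axis 0.
    assert shard_fill_constant_shape(
        [-1, 16, 0, 48], [-1, 0, -1, -1], [2]) == [-1, 8, 0, 48]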