From 13cf4cede2c563b622335079dde391ef5c57a63c Mon Sep 17 00:00:00 2001
From: Allen Guo
Date: Wed, 15 Jun 2022 18:51:54 +0800
Subject: [PATCH] [IPU] Decoupling ipu sharding and modeling (#43164)

* Decoupling ipu sharding and modeling (#665)

* feat(shard): decoupling shard setting with modeling.

* fix(shard): split test cases to avoid failure.

* fix(shard): add function docs and fix typo.

* test(shard): add tests.

* test(shard): more test case.

* fix(): change ipu_index/stage default value to -1.

* fix format

Co-authored-by: czr-gc <96037699+czr-gc@users.noreply.github.com>
---
 python/paddle/fluid/framework.py              |  76 +++++-
 .../unittests/ipu/test_set_ipu_shard_api.py   | 252 ++++++++++++++++++
 python/paddle/static/__init__.py              |   3 +-
 3 files changed, 321 insertions(+), 10 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 2412e300a77..687b244c6d5 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -48,6 +48,7 @@ __all__ = [
     'program_guard',
     'name_scope',
     'ipu_shard_guard',
+    'set_ipu_shard',
     'cuda_places',
     'cpu_places',
     'xpu_places',
@@ -252,28 +253,28 @@ def _test_eager_guard(place=None):
         _enable_legacy_dygraph()
 
 
-global_ipu_index = None
-global_ipu_stage = None
+global_ipu_index = -1
+global_ipu_stage = -1
 ipu_index_attr_name = 'ipu_index'
 ipu_stage_attr_name = 'ipu_stage'
 
 
 @signature_safe_contextmanager
-def ipu_shard_guard(index=None, stage=None):
+def ipu_shard_guard(index=-1, stage=-1):
     """
     Used to shard the graph on IPUs. Set each Op run on which IPU in the sharding and which stage in the pipelining.
 
     Args:
         index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3').
-            The default value is None, which means the Op only run on IPU 0.
+            The default value is -1, which means the Op only runs on IPU 0.
         stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3').
-            The sharded model will be computed from small to large. The default value is None,
+            The sharded model will be computed from small to large. The default value is -1,
             which means no pipelining computation order and run Ops in terms of graph.
 
     **Note**:
-    Only if the enable_manual_shard=True, the 'index' is able to be set not None. Please refer
+    Only if enable_manual_shard=True, the 'index' can be set to a value other than -1. Please refer
     to :code:`paddle.static.IpuStrategy` .
-    Only if the enable_pipelining=True, the 'stage' is able to be set not None. Please refer
+    Only if enable_pipelining=True, the 'stage' can be set to a value other than -1. Please refer
     to :code:`paddle.static.IpuStrategy` .
     A index is allowed to match none stage or a stage. A stage is only allowed to match a new or duplicated index.
@@ -311,6 +312,63 @@ def ipu_shard_guard(index=None, stage=None):
         global_ipu_stage = prev_ipu_stage
 
 
+def set_ipu_shard(call_func, index=-1, stage=-1):
+    """
+    Shard the graph on IPUs with the given call function. Every op created in the call function
+    is assigned the given ipu sharding.
+
+    Args:
+        call_func(Layer|function): Specify the call function to be wrapped.
+        index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3').
+            The default value is -1, which means the Op only runs on IPU 0.
+        stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3').
+            The sharded model will be computed from small to large. The default value is -1,
+            which means no pipelining computation order and Ops are run in graph order.
+
+    Returns:
+        The wrapped call function.
+
+    Examples:
+        .. code-block:: python
+
+            # required: ipu
+
+            import paddle
+            paddle.enable_static()
+            a = paddle.static.data(name='data', shape=[None, 1], dtype='float32')
+            relu = paddle.nn.ReLU()
+            relu = paddle.static.set_ipu_shard(relu, index=1, stage=1)
+            relu(a)
+    """
+
+    def decorate(func):
+
+        def wrapper(*args, **kwargs):
+            with ipu_shard_guard(index=index, stage=stage):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    from .dygraph.layers import Layer
+    if not isinstance(call_func, Layer):
+        if callable(call_func):
+            return decorate(call_func)
+        else:
+            raise TypeError(
+                "Unsupported type. Only accept paddle.nn.Layer or function.")
+
+    # patch paddle.nn.Layer
+    class BlockFn(type(call_func)):
+
+        def __call__(self, *args, **kwargs):
+            with ipu_shard_guard(index=index, stage=stage):
+                return super().__call__(*args, **kwargs)
+
+    BlockFn.__name__ = type(call_func).__name__
+    call_func.__class__ = BlockFn
+    return call_func
+
+
 def require_version(min_version, max_version=None):
     """
     Check if the installed version of PaddlePaddle is in [min_version, max_version],
@@ -2772,10 +2830,10 @@ class Operator(object):
 
         # proto.attrs doesn't include ipu_index
         if core.is_compiled_with_ipu():
-            if global_ipu_index is not None:
+            if global_ipu_index >= 0:
                 self._update_desc_attr(ipu_index_attr_name,
                                        global_ipu_index)
-            if global_ipu_stage is not None:
+            if global_ipu_stage >= 0:
                 self._update_desc_attr(ipu_stage_attr_name,
                                        global_ipu_stage)
 
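The most subtle part of the framework.py change is the paddle.nn.Layer branch: instead of returning a wrapper function, set_ipu_shard swaps the instance's __class__ for an on-the-fly subclass whose __call__ enters ipu_shard_guard, so the layer keeps its identity and parameters. The following framework-free sketch reproduces that technique with stand-in names; shard_guard, FakeLayer and set_shard are illustrative only and are not part of the patch.

# Minimal, framework-free sketch of the wrapping strategy used by
# set_ipu_shard above. All names here are illustrative only.
from contextlib import contextmanager

_current_index = -1  # stands in for global_ipu_index


@contextmanager
def shard_guard(index=-1):
    """Analogue of ipu_shard_guard: set a global, restore it on exit."""
    global _current_index
    prev = _current_index
    _current_index = index
    try:
        yield
    finally:
        _current_index = prev


class FakeLayer:
    """Stands in for paddle.nn.Layer: the work happens in __call__."""

    def __call__(self, x):
        return x + 1, _current_index


def set_shard(call_func, index=-1):
    """Analogue of set_ipu_shard for this sketch."""
    if not isinstance(call_func, FakeLayer):
        # Plain function or bound method: wrap it in a closure.
        def wrapper(*args, **kwargs):
            with shard_guard(index=index):
                return call_func(*args, **kwargs)
        return wrapper

    # Layer-like object: swap in a subclass whose __call__ enters the guard,
    # so the instance keeps its identity and state.
    class Guarded(type(call_func)):
        def __call__(self, *args, **kwargs):
            with shard_guard(index=index):
                return super().__call__(*args, **kwargs)

    Guarded.__name__ = type(call_func).__name__
    call_func.__class__ = Guarded
    return call_func


layer = set_shard(FakeLayer(), index=3)
print(layer(1))  # (2, 3): the call ran while the guard's index was active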
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py
new file mode 100644
index 00000000000..a7104fd4266
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py
@@ -0,0 +1,252 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+from paddle.static import set_ipu_shard
+
+paddle.enable_static()
+
+
+class SimpleNet(paddle.nn.Layer):
+
+    def __init__(self, input_size, output_size):
+        super(SimpleNet, self).__init__()
+        self.linear1 = nn.Linear(input_size, output_size)
+        self.relu1 = nn.ReLU()
+        self.linear2 = nn.Linear(input_size, output_size)
+        self.relu2 = nn.ReLU()
+        self.linear3 = nn.Linear(input_size, output_size)
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.relu1(x)
+        x = self.linear_relu2(x)
+        x = self.linear3(x)
+        return x
+
+    def linear_relu2(self, x):
+        x = self.linear2(x)
+        x = self.relu2(x)
+        return x
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestSetIpuShard(unittest.TestCase):
+
+    def _test(self):
+        # build graph
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
+            label = paddle.static.data(name='Y',
+                                       shape=[10, 46],
+                                       dtype='float32')
+            model = SimpleNet(46, 46)
+
+            set_ipu_shard(model.linear1, index=1)
+            set_ipu_shard(model.relu1, index=2)
+            model.linear_relu2 = set_ipu_shard(model.linear_relu2, index=3)
+            model.linear3 = set_ipu_shard(model.linear3, index=4)
+            out = model(x)
+
+        ipu_index_list = []
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_index"):
+                ipu_index_list.append(op.desc.attr("ipu_index"))
+
+        return ipu_index_list
+
+    def test_set_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [1, 1, 2, 3, 3, 3, 4, 4]
+
+        self.assertTrue(
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestSetIpuPipeline(unittest.TestCase):
+
+    def _test(self):
+        # build graph
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
+            label = paddle.static.data(name='Y',
+                                       shape=[10, 46],
+                                       dtype='float32')
+            model = SimpleNet(46, 46)
+
+            set_ipu_shard(model.linear1, stage=1)
+            set_ipu_shard(model.relu1, stage=2)
+            model.linear_relu2 = set_ipu_shard(model.linear_relu2, stage=3)
+            model.linear3 = set_ipu_shard(model.linear3, stage=4)
+            out = model(x)
+
+        ipu_index_list = []
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_stage"):
+                ipu_index_list.append(op.desc.attr("ipu_stage"))
+
+        return ipu_index_list
+
+    def test_set_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [1, 1, 2, 3, 3, 3, 4, 4]
+
+        self.assertTrue(
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestSetIpuShardAndPipeline(unittest.TestCase):
+
+    def _test(self):
+        # build graph
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
+            label = paddle.static.data(name='Y',
+                                       shape=[10, 46],
+                                       dtype='float32')
+            model = SimpleNet(46, 46)
+
+            set_ipu_shard(model.linear1, index=1, stage=2)
+            set_ipu_shard(model.relu1, index=2, stage=3)
+            model.linear_relu2 = set_ipu_shard(model.linear_relu2,
+                                               index=3,
+                                               stage=4)
+            model.linear3 = set_ipu_shard(model.linear3, index=4, stage=1)
+            out = model(x)
+
+        ipu_index_list = []
+        ipu_stage_list = []
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_index"):
+                ipu_index_list.append(op.desc.attr("ipu_index"))
+            if op.desc.has_attr("ipu_stage"):
+                ipu_stage_list.append(op.desc.attr("ipu_stage"))
+
+        return ipu_index_list + ipu_stage_list
+
+    def test_set_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [
+            1, 1, 2, 3, 3, 3, 4, 4, 2, 2, 3, 4, 4, 4, 1, 1
+        ]
+
+        self.assertTrue(
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestSetIpuForModel(unittest.TestCase):
+
+    def _test(self):
+        # build graph
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
+            label = paddle.static.data(name='Y',
+                                       shape=[10, 46],
+                                       dtype='float32')
+            model = SimpleNet(46, 46)
+
+            set_ipu_shard(model, index=1, stage=2)
+            out = model(x)
+
+        ipu_index_list = []
+        ipu_stage_list = []
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_index"):
+                ipu_index_list.append(op.desc.attr("ipu_index"))
+            if op.desc.has_attr("ipu_stage"):
+                ipu_stage_list.append(op.desc.attr("ipu_stage"))
+
+        return ipu_index_list + ipu_stage_list
+
+    def test_set_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [
+            1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
+        ]
+
+        self.assertTrue(
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestSetIpuMixedModel(unittest.TestCase):
+
+    def setUp(self):
+
+        def linear_relu2_mixed(self, x):
+            with paddle.static.ipu_shard_guard(index=2, stage=3):
+                x = self.linear2(x)
+            with paddle.static.ipu_shard_guard(index=3, stage=4):
+                x = self.relu2(x)
+            return x
+
+        self._old_linear = SimpleNet.linear_relu2
+        SimpleNet.linear_relu2 = linear_relu2_mixed
+
+    def tearDown(self):
+        SimpleNet.linear_relu2 = self._old_linear
+
+    def _test(self):
+        # build graph
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
+            label = paddle.static.data(name='Y',
+                                       shape=[10, 46],
+                                       dtype='float32')
+            model = SimpleNet(46, 46)
+
+            set_ipu_shard(model.linear1, index=1, stage=2)
+            set_ipu_shard(model.relu1, index=2, stage=3)
+            model.linear3 = set_ipu_shard(model.linear3, index=4, stage=1)
+            out = model(x)
+
+        ipu_index_list = []
+        ipu_stage_list = []
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_index"):
+                ipu_index_list.append(op.desc.attr("ipu_index"))
+            if op.desc.has_attr("ipu_stage"):
+                ipu_stage_list.append(op.desc.attr("ipu_stage"))
+
+        return ipu_index_list + ipu_stage_list
+
+    def test_set_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [
+            1, 1, 2, 2, 2, 3, 4, 4, 2, 2, 3, 3, 3, 4, 1, 1
+        ]
+
+        self.assertTrue(
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+if __name__ == "__main__":
+    unittest.main()
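Each test above reads the recorded attributes back with the same loop over the program's ops. A small helper in that spirit is sketched below; it is a hypothetical debugging aid, not part of the patch, and only reads the ipu_index / ipu_stage attributes that the patched Operator constructor writes when Paddle is compiled with IPU support.

import paddle

paddle.enable_static()


def dump_ipu_sharding(program):
    # Hypothetical helper (not part of the patch): print the per-op sharding
    # decisions recorded in a static Program, mirroring what the tests assert.
    for op in program.global_block().ops:
        desc = op.desc
        index = desc.attr("ipu_index") if desc.has_attr("ipu_index") else None
        stage = desc.attr("ipu_stage") if desc.has_attr("ipu_stage") else None
        print(f"{op.type:<20} ipu_index={index} ipu_stage={stage}")


# Usage: build main_prog as in the tests above, then:
# dump_ipu_sharding(main_prog)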
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index 8707c259ead..191a76c6dc7 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -51,6 +51,7 @@ from ..fluid.framework import mlu_places  # noqa: F401
 from ..fluid.framework import npu_places  # noqa: F401
 from ..fluid.framework import Variable  # noqa: F401
 from ..fluid.framework import ipu_shard_guard  # noqa: F401
+from ..fluid.framework import set_ipu_shard  # noqa: F401
 from ..fluid.layers.control_flow import Print  # noqa: F401
 from ..fluid.layers.nn import py_func  # noqa: F401
 from ..fluid.parallel_executor import ParallelExecutor  # noqa: F401
@@ -81,5 +82,5 @@ __all__ = [  #noqa
     'deserialize_persistables', 'load_from_file', 'normalize_program',
     'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places',
     'xpu_places', 'npu_places', 'mlu_places', 'Variable', 'create_global_var',
-    'accuracy', 'auc', 'device_guard', 'create_parameter'
+    'accuracy', 'auc', 'device_guard', 'create_parameter', 'set_ipu_shard'
 ]
-- 
GitLab
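For completeness, here is a sketch of how the new paddle.static.set_ipu_shard plugs into an actual IPU compilation. It assumes an IPU build of Paddle and the paddle.static.IpuStrategy / IpuCompiledProgram APIs of this release; the method and parameter names below are from memory and may differ slightly between versions. As the ipu_shard_guard docstring notes, enable_manual_shard is what makes ipu_index take effect (and enable_pipelining does the same for ipu_stage).

import numpy as np
import paddle
import paddle.nn as nn

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='X', shape=[10, 46], dtype='float32')
    # Place the two layers on different IPUs via the new API.
    fc1 = paddle.static.set_ipu_shard(nn.Linear(46, 46), index=0)
    fc2 = paddle.static.set_ipu_shard(nn.Linear(46, 46), index=1)
    out = fc2(fc1(x))

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)

# enable_manual_shard makes the recorded ipu_index attributes take effect.
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=2,
                              is_training=False,
                              enable_manual_shard=True)

program = paddle.static.IpuCompiledProgram(
    main_prog, ipu_strategy=ipu_strategy).compile(feed_list=[x.name],
                                                  fetch_list=[out.name])

# The compiled program is then run as usual, e.g.:
# res = exe.run(program,
#               feed={'X': np.random.rand(10, 46).astype(np.float32)},
#               fetch_list=[out.name])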