Commit 14fe72f3 authored by Yi Huaijie

fix pylint warnings
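The files below appear as before/after copies of the affected parallel tests (the larger ones are shown as unified-diff hunks instead). The warnings addressed include wrong-import-order and unused-import (import blocks reordered, unused pytest imports dropped), redefined-builtin (the parameter name input renamed to input_), invalid-name (dataGen and GenValue renamed to data_gen and gen_value), unused-variable (unused results assigned to _), superfluous parentheses around assert conditions, and trivially-true comparisons. A condensed illustration of the recurring pattern:

# before                                   after
#   def construct(self, input, indices):   def construct(self, input_, indices):
#   assert (ret == 0)                       assert ret == 0
#   assert True == True                     assert True
#   dataGen = DataGenerator()               data_gen = DataGenerator()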

# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import os
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
device_num = distributedTool.get_group_size()
rank_id = distributedTool.get_rank()
context.set_auto_parallel_context(device_num=device_num,
global_rank=rank_id)
def teardown_module():
distributedTool.release()
class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
trans_stra = None
if strategy:
trans_stra = (strategy[0],)
self.onehot = P.OneHot().set_strategy(strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)
self.transpose = P.Transpose().set_strategy(strategy=trans_stra)
self.sub = P.Sub().set_strategy(strategy=((1, 1), (1, 1)))
def construct(self, input, indices):
x = self.onehot(indices, self.depth, self.on_value, self.off_value)
x = self.transpose(x, (1, 0))
x = self.sub(input, x)
return x
class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def generate_data(self, shape):
data = np.random.rand(*shape)
return data
def input_data(self, shape):
data = (self.generate_data(shape) * 2).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
def label_data(self, shape, classes):
data = (self.generate_data(shape) * (classes - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
class OneHotFactory:
def __init__(self, batch_size, classes, on_value=1.0, off_value=0.0, axis=None, strategy=None):
dataGen = DataGenerator()
self.input_full, self.input_part = dataGen.input_data((classes, batch_size))
self.label_full, self.label_part = dataGen.label_data((batch_size,), classes)
self.depth = classes
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.strategy = strategy
def forward_mindspore_single_impl(self):
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(self.input_full, self.label_full)
return out
def forward_mindspore_parallel_impl(self):
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy)
out = net.compile_and_run(self.input_full, self.label_full)
return out
def forward_cmp(self):
out_mindspore_single = self.forward_mindspore_single_impl().asnumpy()
context.reset_auto_parallel_context()
out_mindspore_parallel = self.forward_mindspore_parallel_impl().asnumpy()
context.reset_auto_parallel_context()
assert np.allclose(out_mindspore_single, out_mindspore_parallel, 0.0001, 0.0001)
def test_reid_onehot_forward_int32_128_depth1024_model_parallel():
fact = OneHotFactory(batch_size=128,
classes=1024,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
def test_reid_onehot_forward_int32_1024_depth128_model_parallel():
fact = OneHotFactory(batch_size=1024,
classes=128,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
device_num = distributedTool.get_group_size()
rank_id = distributedTool.get_rank()
context.set_auto_parallel_context(device_num=device_num,
global_rank=rank_id)
def teardown_module():
distributedTool.release()
class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
trans_stra = None
if strategy:
trans_stra = (strategy[0],)
self.onehot = P.OneHot().set_strategy(strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)
self.transpose = P.Transpose().set_strategy(strategy=trans_stra)
self.sub = P.Sub().set_strategy(strategy=((1, 1), (1, 1)))
self.axis = axis
def construct(self, input_, indices):
x = self.onehot(indices, self.depth, self.on_value, self.off_value)
x = self.transpose(x, (1, 0))
x = self.sub(input_, x)
return x
class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def generate_data(self, shape):
data = np.random.rand(*shape)
return data
def input_data(self, shape):
data = (self.generate_data(shape) * 2).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
def label_data(self, shape, classes):
data = (self.generate_data(shape) * (classes - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
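# A minimal illustration of how get_parallel_blocks() shards a tensor
# (hypothetical 4x4 input; not used by the tests themselves): with strategy
# [2, 2] the array is cut into four row-major blocks and device rank_id takes
# blocks[rank_id].
_example_data = np.arange(16).reshape(4, 4)
_example_blocks = DataGenerator().get_parallel_blocks(_example_data, [2, 2])
assert np.array_equal(_example_blocks[0], _example_data[:2, :2])  # block for rank 0
assert np.array_equal(_example_blocks[3], _example_data[2:, 2:])  # block for rank 3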
class OneHotFactory:
def __init__(self, batch_size, classes, on_value=1.0, off_value=0.0, axis=None, strategy=None):
data_gen = DataGenerator()
self.input_full, self.input_part = data_gen.input_data((classes, batch_size))
self.label_full, self.label_part = data_gen.label_data((batch_size,), classes)
self.depth = classes
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.strategy = strategy
def forward_mindspore_single_impl(self):
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(self.input_full, self.label_full)
return out
def forward_mindspore_parallel_impl(self):
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy)
out = net.compile_and_run(self.input_full, self.label_full)
return out
def forward_cmp(self):
out_mindspore_single = self.forward_mindspore_single_impl().asnumpy()
context.reset_auto_parallel_context()
out_mindspore_parallel = self.forward_mindspore_parallel_impl().asnumpy()
context.reset_auto_parallel_context()
assert np.allclose(out_mindspore_single, out_mindspore_parallel, 0.0001, 0.0001)
def test_reid_onehot_forward_int32_128_depth1024_model_parallel():
fact = OneHotFactory(batch_size=128,
classes=1024,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
def test_reid_onehot_forward_int32_1024_depth128_model_parallel():
fact = OneHotFactory(batch_size=1024,
classes=128,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
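# What forward_cmp() above exercises: the same Onehot cell is run once without
# strategies (stand-alone graph) and once under semi_auto_parallel with
# strategy ((1, device_num), (), ()), and the two full outputs must agree
# within rtol/atol 1e-4. The (1, device_num) tuple appears to shard the
# depth/class dimension of the one-hot output across devices, while the empty
# tuples cover the scalar on_value/off_value inputs.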
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_expand_loss():
sh_path = os.path.split(os.path.realpath(__file__))[0]
ret = os.system(f"sh {sh_path}/run_auto_parallel_loss_expand.sh")
assert (ret == 0)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_expand_loss():
sh_path = os.path.split(os.path.realpath(__file__))[0]
ret = os.system(f"sh {sh_path}/run_auto_parallel_loss_expand.sh")
assert ret == 0
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
def test_expand_loss():
ret = os.system("sh run_onehot_model_parallel.sh")
assert (ret == 0)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
def test_expand_loss():
ret = os.system("sh run_onehot_model_parallel.sh")
assert ret == 0
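# These launcher tests only shell out to the distributed scripts
# (run_auto_parallel_loss_expand.sh / run_onehot_model_parallel.sh), which are
# expected to start the multi-device run themselves; the Python side merely
# asserts that the script exited with status 0.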
@@ -13,8 +13,8 @@
# limitations under the License.
# ============================================================================
import numpy as np
import os
import numpy as np
import pytest
import mindspore.common.dtype as mstype
@@ -37,31 +37,29 @@ init()
context.set_auto_parallel_context(mirror_mean=True, parallel_mode=ParallelMode.AUTO_PARALLEL)
def weight_variable(shape, factor=0.1):
def weight_variable():
return One()
def _conv3x3(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
init_value = weight_variable((out_channels, in_channels, 3, 3))
init_value = weight_variable()
return nn.Conv2d(in_channels, out_channels,
kernel_size=3, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)
def _conv1x1(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
init_value = weight_variable((out_channels, in_channels, 1, 1))
init_value = weight_variable()
return nn.Conv2d(in_channels, out_channels,
kernel_size=1, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)
def _conv7x7(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
init_value = weight_variable((out_channels, in_channels, 7, 7))
init_value = weight_variable()
return nn.Conv2d(in_channels, out_channels,
kernel_size=7, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)
def _fused_bn(channels, momentum=0.9):
init_weight = weight_variable((channels,))
init_bias = weight_variable((channels,))
return nn.BatchNorm2d(channels, momentum=momentum)
@@ -210,8 +208,8 @@ class ResNet(nn.Cell):
self.mean = P.ReduceMean(keep_dims=True)
self.end_point = nn.Dense(2048, num_classes, has_bias=True,
weight_init=weight_variable((num_classes, 2048)),
bias_init=weight_variable((num_classes,)))
weight_init=weight_variable(),
bias_init=weight_variable())
self.squeeze = P.Squeeze()
self.cast = P.Cast()
@@ -345,9 +343,8 @@ class Dataset():
raise StopIteration
self.index += 1
if self.input_num == 2:
return self.predict, self.label
else:
return self.predict,
return (self.predict, self.label)
return (self.predict,)
def reset(self):
self.index = 0
@@ -364,7 +361,7 @@ class ModelCallback(Callback):
super(ModelCallback, self).__init__()
self.loss_list = []
def epoch_end(self, run_context, *args):
def epoch_end(self, run_context):
cb_params = run_context.original_args()
result = cb_params.net_outputs
self.loss_list.append(result.asnumpy().mean())
@@ -376,9 +373,9 @@ class ModelCallback(Callback):
def test_train_feed(num_classes=8192):
set_algo_parameters(elementwise_op_strategy_follow=True)
parallel_callback = ModelCallback()
dataGen = DataGenerator()
input_full, input_part = dataGen.input_data((32 * 2, 3, 224, 224))
label_full, label_part = dataGen.label_data((32 * 2,))
data_gen = DataGenerator()
_, input_part = data_gen.input_data((32 * 2, 3, 224, 224))
_, label_part = data_gen.label_data((32 * 2,))
dataset = Dataset(input_part, label_part)
net = resnet50(num_classes)
loss = SoftmaxCrossEntropyExpand(sparse=True)
@@ -396,9 +393,9 @@ def test_train_feed(num_classes=8192):
def test_train_feed2(num_classes=1001):
set_algo_parameters(elementwise_op_strategy_follow=True)
parallel_callback = ModelCallback()
dataGen = DataGenerator()
input_full, input_part = dataGen.input_data((32 * 2, 3, 224, 224))
label_full, label_part = dataGen.label_data((32 * 2,))
data_gen = DataGenerator()
_, input_part = data_gen.input_data((32 * 2, 3, 224, 224))
_, label_part = data_gen.label_data((32 * 2,))
dataset = Dataset(input_part, label_part)
net = resnet50(num_classes)
loss = SoftmaxCrossEntropyExpand(sparse=True)
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../../..")
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../../..")
@@ -25,7 +25,6 @@ from mindspore.nn import Dense
from mindspore.nn import Momentum
from mindspore.nn import ReLU
from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.ops.operations import Split
from mindspore.ops.operations.comm_ops import AllReduce, AllGather, _AlltoAll, ReduceOp, ReduceScatter
from mindspore.ops.operations.comm_ops import Broadcast
......
@@ -16,8 +16,8 @@
@File : test_data_parallel_lenet.py
@Desc : test data parallel lenet
"""
import numpy as np
import os
import numpy as np
import mindspore.context as context
import mindspore.nn as nn
@@ -80,7 +80,6 @@ def test_lenet5_train_step_training_pynative():
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
device_num=8, mirror_mean=True)
size = 3
predict = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
label = Tensor(np.zeros([1, 10]).astype(np.float32))
DatasetLenet(predict, label, 2)
......
@@ -19,7 +19,7 @@ from mindspore.parallel._utils import _reset_op_id
from mindspore.parallel.algo_parameter_config import reset_algo_parameters
def setup_module(module):
def setup_module():
auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
reset_cost_model_context()
......
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, z):
out = self.add(x, z)
return self.relu(out)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def forward_mindspore_impl(self):
net = AddRelu()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, z):
out = self.add(x, z)
return self.relu(out)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def forward_mindspore_impl(self):
net = AddRelu()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
_ = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
_ = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
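# Note on the pattern used in this file: the parallel cells are compiled
# against the full-shape tensors (parallel_inputs_compile=[x, y]) but executed
# with this device's shards (parallel_inputs_run=[x1, y1]). forward_cmp() then
# slices the single-device result with get_parallel_blocks() and checks that
# the block belonging to this rank matches the parallel output within
# rtol=1e-4 / atol=1e-3, while grad_cmp() compares the gradient of the scalar
# second input between the two runs with rtol=atol=1e-4.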
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn import Dropout
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Net(Cell):
def __init__(self, keep_prob, seed0, seed1, strategy=None):
super(Net, self).__init__()
self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy)
def construct(self, input):
x = self.drop(input)
return x
# pylint: disable=comparison-with-itself
class DropoutFactory:
def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32)
self.keep_prob = keep_prob
self.seed0 = seed0
self.seed1 = seed1
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def d4_tensor_compare(self, input, out_me):
[a, b, c, d] = input.shape
for i in range(a):
for j in range(b):
for k in range(c):
for e in range(d):
if out_me[i, j, k, e] == 0:
assert True == True
else:
assert np.allclose(out_me[i, j, k, e], input[i, j, k, e] * (1 / 0.4), 0.0001, 0.0001)
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Net(0.4, 0, 0, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()
def forward_cmp(self):
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1])
self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel)
def test_reid_dropout_forward_seed_F32_64_512_8_8():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1)))
fact.forward_cmp()
def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1)))
fact.forward_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn import Dropout
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Net(Cell):
def __init__(self, keep_prob, seed0, seed1, strategy=None):
super(Net, self).__init__()
self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy)
def construct(self, input_):
x = self.drop(input_)
return x
# pylint: disable=comparison-with-itself
class DropoutFactory:
def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32)
self.keep_prob = keep_prob
self.seed0 = seed0
self.seed1 = seed1
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def d4_tensor_compare(self, input_, out_me):
[a, b, c, d] = input_.shape
for i in range(a):
for j in range(b):
for k in range(c):
for e in range(d):
if out_me[i, j, k, e] == 0:
assert True
else:
assert np.allclose(out_me[i, j, k, e], input_[i, j, k, e] * (1 / 0.4), 0.0001, 0.0001)
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Net(0.4, 0, 0, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()
def forward_cmp(self):
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1])
self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel)
def test_reid_dropout_forward_seed_F32_64_512_8_8():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1)))
fact.forward_cmp()
def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1)))
fact.forward_cmp()
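# The check in d4_tensor_compare() relies on inverted dropout: with
# keep_prob = 0.4 every surviving element is scaled by 1 / 0.4 and every
# dropped element becomes 0. A small self-contained sketch of that property
# (with an assumed keep mask, for illustration only):
_p = 0.4
_x = np.arange(1.0, 5.0, dtype=np.float32).reshape(2, 2)
_mask = np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32)  # assumed keep mask
_out = _x * _mask / _p
assert np.allclose(_out[_mask == 1], _x[_mask == 1] / _p)
assert np.all(_out[_mask == 0] == 0)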
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y):
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulAllgather(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulAllgather, self).__init__()
self.allgather = P.AllGather(group=group)
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce = P.AllReduce(group=group)
def construct(self, x, y):
x = self.allgather(x)
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, sens):
return grad_all_with_sens(self.network)(x, y, sens)
class MatmulAllgatherFactory:
def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra):
self.inputx = self.GenValue(inputx_shape, 10)
self.inputy = self.GenValue(inputy_shape, 20)
self.x_stra = x_stra
self.y_stra = y_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def GenValue(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulAllgather("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (64, 32)
inputy_shape = (32, 64)
fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y):
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulAllgather(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulAllgather, self).__init__()
self.allgather = P.AllGather(group=group)
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce = P.AllReduce(group=group)
def construct(self, x, y):
x = self.allgather(x)
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, sens):
return grad_all_with_sens(self.network)(x, y, sens)
class MatmulAllgatherFactory:
def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra):
self.inputx = self.gen_value(inputx_shape, 10)
self.inputy = self.gen_value(inputy_shape, 20)
self.x_stra = x_stra
self.y_stra = y_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def gen_value(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulAllgather("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (64, 32)
inputy_shape = (32, 64)
fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4))
fact.grad_cmp()
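# Summary of the comparison above: MatmulSingle computes
# sum(matmul(x, y) ** 2) on the full tensors, while MatmulAllgather runs on
# this device's row-shard of x (strategy (4, 1)) and column-shard of y
# (strategy (1, 4)), gathering x across the group before the matmul and
# all-reducing the scalar loss afterwards. grad_cmp() shards the single-device
# gradients with get_parallel_blocks() and requires each device's block to
# match the parallel gradients within rtol=atol=1e-4; sens for the parallel
# run is rescaled to 1 / (stra_size * repeat_num ** 3) as written above.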
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulReduce(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulReduce, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.allreduce1 = P.AllReduce(group=group)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce2 = P.AllReduce(group=group)
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.allreduce1(out)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce2(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, z, sens):
return grad_all_with_sens(self.network)(x, y, z, sens)
class MatmulReduceFactory:
def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra):
self.inputx = self.GenValue(inputx_shape, 10)
self.inputy = self.GenValue(inputy_shape, 20)
self.inputz = self.GenValue(inputz_shape, 30)
self.x_stra = x_stra
self.y_stra = y_stra
self.z_stra = z_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def GenValue(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
z = Tensor(self.inputz)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
inputzs = self.get_parallel_blocks(self.inputz, self.z_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
z = Tensor(inputzs[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulReduce("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size]
reduce_result2 = reduce_results[2].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4))
fact.grad_cmp()
def test_reduce_grad_repeat():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulReduce(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulReduce, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.allreduce1 = P.AllReduce(group=group)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce2 = P.AllReduce(group=group)
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.allreduce1(out)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce2(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, z, sens):
return grad_all_with_sens(self.network)(x, y, z, sens)
class MatmulReduceFactory:
def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra):
self.inputx = self.gen_value(inputx_shape, 10)
self.inputy = self.gen_value(inputy_shape, 20)
self.inputz = self.gen_value(inputz_shape, 30)
self.x_stra = x_stra
self.y_stra = y_stra
self.z_stra = z_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def gen_value(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
z = Tensor(self.inputz)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
inputzs = self.get_parallel_blocks(self.inputz, self.z_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
z = Tensor(inputzs[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulReduce("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size]
reduce_result2 = reduce_results[2].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4))
fact.grad_cmp()
def test_reduce_grad_repeat():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class L2normalize(Cell):
def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None):
super(L2normalize, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.l2norm = P.L2Normalize(axis, epsilon, strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.l2norm(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class L2normalizeFactory:
def __init__(self, input_shape, axis, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.axis = axis
self.epsilon = 1e-4
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = L2normalize(self.axis, self.epsilon)
out = net(x, y)
return out.asnumpy()
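# The parallel run compiles against the full-shape tensors (parallel_inputs_compile)
# but executes with this rank's slices (parallel_inputs_run), cut according to strategy0.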
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = L2normalize(self.axis, self.epsilon)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
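# forward_cmp slices the single-device output with out_strategy and checks that this rank's
# block matches the parallel output; grad_cmp applies the same check to both input gradients.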
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, (0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_cmp()
def test_reid_l2normalize_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class L2normalize(Cell):
def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None):
super(L2normalize, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.l2norm = P.L2Normalize(axis, epsilon, strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.l2norm(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class L2normalizeFactory:
def __init__(self, input_shape, axis, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.axis = axis
self.epsilon = 1e-4
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
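# Recursively split the full array along each axis according to the strategy; the resulting
# list of blocks is ordered so that a device can pick out its own slice by index.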
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = L2normalize(self.axis, self.epsilon)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = L2normalize(self.axis, self.epsilon)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, (0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_cmp()
def test_reid_l2normalize_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.relu(out)
return out
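# NetWithLoss appends SoftmaxCrossEntropyWithLogits to the AddRelu network and returns
# only the loss value (the first element of the loss output).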
class NetWithLoss(Cell):
def __init__(self, network, strategy2=None):
super(NetWithLoss, self).__init__()
self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2)
self.network = network
def construct(self, x, y, b):
predict = self.network(x, y)
return self.loss(predict, b)[0]
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, b):
return grad_all(self.network)(x, y, b)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1, strategy2):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(10, target_size)
self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1, target_shape).astype(
np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
self.strategy2 = strategy2
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
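# The gradients are computed three times in a row; grad_cmp later compares each run against
# the corresponding parallel run, presumably to catch non-deterministic results across executions.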
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = AddRelu()
net_with_loss = NetWithLoss(net)
grad_net = Grad(net_with_loss)
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad)
input_grads.append(input_grad)
return input_grads
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
net_with_loss = NetWithLoss(net, strategy2=self.strategy2)
grad_net = Grad(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad1])
input_grads.append(input_grad)
return input_grads
def grad_cmp(self):
input_grad_mindspores = self.grad_mindspore_impl()
input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl()
for i in range(0, len(input_grad_mindspores)):
input_grad_mindspore = input_grad_mindspores[i]
input_grad_mindspore_parallel = input_grad_mindspore_parallels[i]
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy",
input_grad_blocks_0[self.x_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy",
input_grad_blocks_1[self.y_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy",
input_grad_mindspore_parallel0)
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy",
input_grad_mindspore_parallel1)
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
def test_reid_l2normalize_grad_input_128_512_stridesplit():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.relu(out)
return out
class NetWithLoss(Cell):
def __init__(self, network, strategy2=None):
super(NetWithLoss, self).__init__()
self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2)
self.network = network
def construct(self, x, y, b):
predict = self.network(x, y)
return self.loss(predict, b)[0]
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, b):
return grad_all(self.network)(x, y, b)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1, strategy2):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(10, target_size)
self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1, target_shape).astype(
np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
self.strategy2 = strategy2
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = AddRelu()
net_with_loss = NetWithLoss(net)
grad_net = Grad(net_with_loss)
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad)
input_grads.append(input_grad)
return input_grads
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
net_with_loss = NetWithLoss(net, strategy2=self.strategy2)
grad_net = Grad(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad1])
input_grads.append(input_grad)
return input_grads
def grad_cmp(self):
input_grad_mindspores = self.grad_mindspore_impl()
input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl()
for i in range(0, len(input_grad_mindspores)):
input_grad_mindspore = input_grad_mindspores[i]
input_grad_mindspore_parallel = input_grad_mindspore_parallels[i]
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
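# In addition to the allclose assertions, each run's single-device and parallel gradients are
# saved under ./output/ (one .npy per run, rank, and tensor) so they can be inspected offline.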
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy",
input_grad_blocks_0[self.x_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy",
input_grad_blocks_1[self.y_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy",
input_grad_mindspore_parallel0)
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy",
input_grad_mindspore_parallel1)
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
def test_reid_l2normalize_grad_input_128_512_stridesplit():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)
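# Max adds its two inputs and reduces the sum with ReduceMax along self.axis;
# keep_dims controls whether the reduced axis is retained in the output.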
class Max(Cell):
def __init__(self, axis, keep_dims, strategy0=None, strategy1=None):
super(Max, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis
def construct(self, input1, input2):
out = self.add(input1, input2)
return self.reduce_max(out, self.axis)
class MaxFactory:
def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1):
self.strategy0 = strategy0
self.strategy1 = strategy1
self.axis = axis
self.keep_dims = keep_dims
input_size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
number_range = min(1000, input_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = self.input_np1.copy()
self.out_grad_np = None
out_shape = list(input_shape)
out_shape.pop(axis)
out_size = input_size // input_shape[axis]
number_range_ = min(1000, out_size)
self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2, out_shape).astype(
np.float32)
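# The tests run with keep_dims=False, so the reduced axis vanishes from the output
# and is removed from the output strategy used to slice the expected result.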
out_strategy = list(strategy1[1])
out_strategy.pop(axis)
self.out_strategy = out_strategy
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in out_strategy:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
out = net(input1, input2)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
out_grad = Tensor(self.out_grad_np)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(input1, input2, out_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy)
out_grad = Tensor(output_grads[self.out_id])
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad],
parallel_inputs_run=[x1, y1, out_grad])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
print(out_mindspore)
print(out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_max_forward_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)
class Max(Cell):
def __init__(self, axis, keep_dims, strategy0=None, strategy1=None):
super(Max, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis
def construct(self, input1, input2):
out = self.add(input1, input2)
return self.reduce_max(out, self.axis)
class MaxFactory:
def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1):
self.strategy0 = strategy0
self.strategy1 = strategy1
self.axis = axis
self.keep_dims = keep_dims
input_size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
number_range = min(1000, input_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = self.input_np1.copy()
self.out_grad_np = None
out_shape = list(input_shape)
out_shape.pop(axis)
out_size = input_size // input_shape[axis]
number_range_ = min(1000, out_size)
self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2, out_shape).astype(
np.float32)
out_strategy = list(strategy1[1])
out_strategy.pop(axis)
self.out_strategy = out_strategy
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in out_strategy:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
out = net(input1, input2)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
out_grad = Tensor(self.out_grad_np)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(input1, input2, out_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy)
out_grad = Tensor(output_grads[self.out_id])
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad],
parallel_inputs_run=[x1, y1, out_grad])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
print(out_mindspore)
print(out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_max_forward_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_cmp()
@@ -54,7 +54,7 @@ class Grad(nn.Cell):
return C.grad_all(self.network)(x, y)
-def compile(net, x, y):
+def compile_net(net, x, y):
net.set_auto_parallel()
_executor.compile(net, x, y)
@@ -69,7 +69,7 @@ def test_add_relu_stride_slice():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
-compile(net, x, y)
+compile_net(net, x, y)
def test_add_relu_all_gather():
@@ -82,4 +82,4 @@ def test_add_relu_all_gather():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
-compile(net, x, y)
+compile_net(net, x, y)
@@ -20,7 +20,6 @@ from mindspore import Tensor
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss
......