Commit d402b944
Authored May 22, 2020 by mindspore-ci-bot; committed May 22, 2020 via Gitee
!1370 delete parallel end-to-end test cases
Merge pull request !1370 from yihuaijie/master
Parents: f967700e, 1e6ee838

Showing 32 changed files with 0 additions and 3614 deletions (+0, -3614)
+0 -178   tests/ut/python/parallel/parallel_end_to_end/add_relu/_test_add_relu_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/add_relu/add_relu_parallel_4p.sh
+0 -356   tests/ut/python/parallel/parallel_end_to_end/batch_parallel/_test_conv2d_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/batch_parallel/conv2d_parallel_4p.sh
+0 -36    tests/ut/python/parallel/parallel_end_to_end/dist_env_4p.sh
+0 -120   tests/ut/python/parallel/parallel_end_to_end/dropout/_test_dropout_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/dropout/dropout_parallel_4p.sh
+0 -154   tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allgather_4p.py
+0 -175   tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allreduce_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/hcom/allgather_4p.sh
+0 -27    tests/ut/python/parallel/parallel_end_to_end/hcom/allreduce_4p.sh
+0 -206   tests/ut/python/parallel/parallel_end_to_end/l2normalize/_test_l2normalize_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/l2normalize/l2normalize_parallel_4p.sh
+0 -1     tests/ut/python/parallel/parallel_end_to_end/log/README.MD
+0 -195   tests/ut/python/parallel/parallel_end_to_end/loss/_test_loss_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/loss/loss_parallel_4p.sh
+0 -329   tests/ut/python/parallel/parallel_end_to_end/matmul/_test_matmul_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/matmul/matmul_parallel_4p.sh
+0 -213   tests/ut/python/parallel/parallel_end_to_end/max/_test_max_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/max/max_parallel_4p.sh
+0 -27    tests/ut/python/parallel/parallel_end_to_end/mul_softmax/mul_activation_parallel_4p.sh
+0 -200   tests/ut/python/parallel/parallel_end_to_end/mul_softmax/need_fix_test_mul_softmax_parallel_4p.py
+0 -147   tests/ut/python/parallel/parallel_end_to_end/onehot/_test_onehot_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/onehot/onehot_parallel_4p.sh
+0 -206   tests/ut/python/parallel/parallel_end_to_end/prelu/_test_prelu_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/prelu/prelu_parallel_4p.sh
+0 -252   tests/ut/python/parallel/parallel_end_to_end/reducemean/_test_reducemean_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/reducemean/reducemean_parallel_4p.sh
+0 -206   tests/ut/python/parallel/parallel_end_to_end/reshape/_test_reshape_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/reshape/reshape_parallel_4p.sh
+0 -235   tests/ut/python/parallel/parallel_end_to_end/transpose/_test_transpose_parallel_4p.py
+0 -27    tests/ut/python/parallel/parallel_end_to_end/transpose/transpose_parallel_4p.sh
tests/ut/python/parallel/parallel_end_to_end/add_relu/_test_add_relu_parallel_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import pytest

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class AddRelu(Cell):
    def __init__(self, strategy0=None, strategy1=None):
        super(AddRelu, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.relu = P.ReLU(strategy=strategy1)

    def construct(self, x, z):
        out = self.add(x, z)
        return self.relu(out)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class AddReluFactory:
    def __init__(self, input_shape, strategy0, strategy1):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = 1.0
        self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
                                         input_shape).astype(np.float32)
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        need_dev_num = 1
        need_dev_num_ = 1
        for s in strategy0[1]:
            need_dev_num = need_dev_num * s
        for s in strategy1[1]:
            need_dev_num_ = need_dev_num_ * s
        self.x_id = device_id % need_dev_num
        self.y_id = device_id % need_dev_num
        self.out_id = device_id % need_dev_num_

    def forward_mindspore_impl(self):
        net = AddRelu()
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        out = net(x, y)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(self.input_np2, ms.float32)
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        output_grad = Tensor(self.output_grad_np)
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        net = AddRelu()
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
        output_grad = Tensor(output_grads[self.out_id])
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(self.input_np2, ms.float32)
        input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
                              parallel_inputs_run=[x1, y1, output_grad])
        return input_grad

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        _ = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        _ = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)


@pytest.mark.reid_forward
def test_reid_add_relu_input_256_64():
    stra0 = (0, (2, 2), ())
    stra1 = (0, (2, 2))
    fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
    fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_add_relu_input_256_64():
    stra0 = (0, (2, 2), ())
    stra1 = (0, (2, 2))
    fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
    fact.grad_cmp()
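For reference, the get_parallel_blocks helper that every one of these deleted tests repeats simply slices a full tensor into the per-device blocks implied by a strategy tuple. The following standalone NumPy sketch is not part of the diff; the helper body is copied from the test above, and the (256, 64) shape with a (2, 2) strategy mirrors the AddRelu cases. It shows which block a rank picks with device_id % 4.

import numpy as np

def get_parallel_blocks(input_, strategy):
    # Split along axis 0 by strategy[0], then each piece along axis 1 by strategy[1], and so on.
    blocks = [input_]
    for axis, stra in enumerate(strategy):
        temp = []
        while blocks:
            temp.extend(np.split(blocks.pop(0), stra, axis=axis))
        blocks.extend(temp)
    return blocks

full = np.arange(256 * 64, dtype=np.float32).reshape(256, 64)
shards = get_parallel_blocks(full, (2, 2))  # the strategy used by the AddRelu tests
assert len(shards) == 4                     # one block per device
assert shards[0].shape == (128, 32)         # each rank holds a quarter of the tensor
# A rank with RANK_ID r feeds shards[r % 4] as its parallel_inputs_run slice.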
tests/ut/python/parallel/parallel_end_to_end/add_relu/add_relu_parallel_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_add_relu_parallel_4p.py>../../log/test_add_relu_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/batch_parallel/_test_conv2d_parallel_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
from numpy import allclose

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore._checkparam import check_bool, twice
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class _Conv(Cell):
    r"""Applies a N-D convolution over an input signal composed of several input
    planes.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, pad_mode, padding,
                 dilation, group, has_bias, weight_init, bias_init):
        super(_Conv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad_mode = pad_mode
        self.padding = padding
        self.dilation = dilation
        self.group = group
        self.has_bias = has_bias
        if not (isinstance(in_channels, int) and in_channels > 0):
            raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op passed '
                             + str(in_channels) + ', should be a int and greater than 0.')
        if (not isinstance(kernel_size, tuple)) or len(kernel_size) != 2 or \
                (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
                kernel_size[0] < 1 or kernel_size[1] < 1:
            raise ValueError('Attr \'kernel_size\' of \'Conv2D\' Op passed '
                             + str(self.kernel_size) + ', should be a int or tuple and equal to or greater than 1.')
        if in_channels % group != 0:
            raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op must be divisible by '
                             'attr \'group\' of \'Conv2D\' Op.')
        if out_channels % group != 0:
            raise ValueError('Attr \'out_channels\' of \'Conv2D\' Op must be divisible by '
                             'attr \'group\' of \'Conv2D\' Op.')
        self.weight = Parameter(initializer(weight_init, [out_channels, in_channels // group, *kernel_size]),
                                name='weight')
        if check_bool(has_bias):
            self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias')
        else:
            if bias_init != 'zeros':
                print("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
            self.bias = None

    def construct(self, *inputs):
        raise NotImplementedError


class Conv2d(_Conv):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0,
                 dilation=1, group=1, has_bias=False, weight_init='normal', bias_init='zeros', strategy=None):
        kernel_size = twice(kernel_size)
        super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride, pad_mode, padding,
                                     dilation, group, has_bias, weight_init, bias_init)
        self.add = P.TensorAdd(strategy)
        self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1,
                               pad_mode=self.pad_mode, pad=self.padding, stride=self.stride,
                               dilation=self.dilation, group=self.group, strategy=None)
        self.bias_add = P.BiasAdd()

    def construct(self, input1, input2):
        x = self.add(input1, input2)
        if self.has_bias:
            return self.bias_add(self.conv2d(x, self.weight), self.bias)
        return self.conv2d(x, self.weight)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, input1, input2, output_grad):
        return grad_all_with_sens(self.network)(input1, input2, output_grad)


class Conv2dFactory:
    def __init__(self, input_shape, filter_shape, stride, pad_mode, padding, dilation, group, has_bias):
        self.in_n, self.in_c, self.in_h, self.in_w = input_shape
        self.out_c, self.kernel_c, self.kernel_h, self.kernel_w = filter_shape
        self.stride = stride
        self.pad_mode = pad_mode
        self.padding = padding
        self.dilation = dilation
        self.group = group
        self.strategy0 = (0, (4, 1, 1, 1), (1, 1, 1, 1))
        prefix = ""
        input_size = 1
        filter_size = 1
        for s in input_shape:
            prefix = prefix + str(s) + "_"
            input_size = input_size * s
        self.prefix = prefix
        for s in filter_shape:
            filter_size = filter_size * s
        number_range1 = min(10, input_size)
        number_range2 = min(10, filter_size)
        self.input_np1 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 2,
                                    input_shape).astype(np.float16)
        self.input_np2 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 4,
                                    input_shape).astype(np.float16)
        self.weight_np = np.reshape(np.arange(0, filter_size) % number_range2 - number_range2 / 2,
                                    filter_shape).astype(np.float16)
        self.has_bias = has_bias
        if self.has_bias is True:
            self.bias_np = np.arange(0, self.out_c).astype(np.float16)
        self.out_shape = (128, 64, 56, 56)
        out_size = 1
        for s in self.out_shape:
            out_size = out_size * s
        number_range3 = min(10, out_size)
        self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range3 - number_range3 / 2,
                                         self.out_shape).astype(np.float16)
        self.x_id = device_id % 4
        self.y_id = device_id % 4
        self.out_strategy = self.strategy0[1]
        self.out_id = device_id % 4

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_conv2d_mindspore_impl(self):
        input1 = Tensor(self.input_np1)
        input2 = Tensor(self.input_np2)
        weight = Tensor(self.weight_np)
        if self.has_bias:
            bias = Tensor(self.bias_np)
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=True, weight_init=weight,
                         bias_init=bias)
        else:
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=False, weight_init=weight)
        out = net(input1, input2)
        return out.asnumpy()

    def forward_conv2d_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        weight = Tensor(self.weight_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        if self.has_bias:
            bias = Tensor(self.bias_np)
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=True, weight_init=weight,
                         bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
        else:
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=False, weight_init=weight,
                         strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_conv2d_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        weight = Tensor(self.weight_np)
        output_grad = Tensor(self.output_grad_np)
        if self.has_bias:
            bias = Tensor(self.bias_np)
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=True, weight_init=weight,
                         bias_init=bias)
        else:
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=False, weight_init=weight)
        grad_net = Grad(net)
        grad_net.set_train()
        out_grad = grad_net(x, y, output_grad)
        return out_grad

    def grad_conv2d_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        weight = Tensor(self.weight_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad = Tensor(self.output_grad_np)
        output_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        output_grad1 = Tensor(output_grads[self.out_id])
        if self.has_bias:
            bias = Tensor(self.bias_np)
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=True, weight_init=weight,
                         bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
        else:
            net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
                         kernel_size=(self.kernel_h, self.kernel_w),
                         stride=self.stride, pad_mode=self.pad_mode,
                         padding=self.padding, dilation=self.dilation,
                         group=self.group, has_bias=False, weight_init=weight,
                         strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_train()
        grad_net.set_auto_parallel()
        out_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
                            parallel_inputs_run=[x1, y1, output_grad1])
        return out_grad

    def forward_conv2d_cmp(self):
        out_mindspore = self.forward_conv2d_mindspore_impl()
        out_mindspore_parallel = self.forward_conv2d_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)

    def grad_conv2d_cmp(self):
        input_grad_mindspore = self.grad_conv2d_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_conv2d_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[1])
        assert allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.001, 0.001)
        assert allclose(input_grad_blocks_1[self.x_id], input_grad_mindspore_parallel1, 0.001, 0.001)


def test_reid_conv2d_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
    fact = Conv2dFactory(input_shape=(128, 64, 112, 112), filter_shape=(64, 64, 1, 1),
                         stride=2, pad_mode='valid', padding=0, dilation=1, group=1, has_bias=False)
    fact.forward_conv2d_cmp()


def test_reid_conv2d_grad_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
    fact = Conv2dFactory(input_shape=(128, 64, 112, 112), filter_shape=(64, 64, 1, 1),
                         stride=2, pad_mode='valid', padding=0, dilation=1, group=1, has_bias=False)
    fact.grad_conv2d_cmp()
tests/ut/python/parallel/parallel_end_to_end/batch_parallel/conv2d_parallel_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_conv2d_parallel_4p.py>../../log/test_conv2d_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/dist_env_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
export SLOG_PRINT_TO_STDOUT=1
source /root/miniconda3/bin/activate ci3.6
export RANK_SIZE=4
export RANK_TABLE_FILE=../../rank_table_4p.json
export RANK_ID=$1
export DEVICE_ID=$1
export HCCL_FLAG=1
export DEPLOY_MODE=0
export AICPU_FLAG=1
export DUMP_OP=1
export PYTHONPATH=../../../../../../../../mindspore:/usr/local/HiAI/runtime/python3.6/site-packages/topi.egg/:/usr/local/HiAI/runtime/python3.6/site-packages/te.egg/:/usr/local/HiAI/runtime/ops/op_impl/built-in/ai_core/tbe/
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/HiAI/runtime/lib64/libhccl.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so
export LD_LIBRARY_PATH=/usr/local/HiAI/runtime/lib64
export FE_FLAG=1
export PATH=/usr/local/HiAI/runtime/ccec_compiler/bin:$PATH
if [ $1 -eq 0 ]; then
    export DUMP_GE_GRAPH=true
    export ME_DRAW_GRAPH=1
fi
tests/ut/python/parallel/parallel_end_to_end/dropout/_test_dropout_parallel_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn import Dropout

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Net(Cell):
    def __init__(self, keep_prob, seed0, seed1, strategy=None):
        super(Net, self).__init__()
        self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy)

    def construct(self, input_):
        x = self.drop(input_)
        return x


# pylint: disable=comparison-with-itself
class DropoutFactory:
    def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None):
        size = 1
        prefix = ""
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(10, size)
        self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32)
        self.keep_prob = keep_prob
        self.seed0 = seed0
        self.seed1 = seed1
        self.strategy0 = strategy0
        need_dev_num = 1
        for s in strategy0[1]:
            need_dev_num = need_dev_num * s
        self.x_id = device_id % need_dev_num
        self.out_id = device_id % need_dev_num

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def d4_tensor_compare(self, input_, out_me):
        [a, b, c, d] = input_.shape
        for i in range(a):
            for j in range(b):
                for k in range(c):
                    for e in range(d):
                        if out_me[i, j, k, e] == 0:
                            assert True
                        else:
                            assert np.allclose(out_me[i, j, k, e], input_[i, j, k, e] * (1 / 0.4), 0.0001, 0.0001)

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np)
        inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        net = Net(0.4, 0, 0, strategy=self.strategy0)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
        return out.asnumpy()

    def forward_cmp(self):
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1])
        self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel)


def test_reid_dropout_forward_seed_F32_64_512_8_8():
    fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1)))
    fact.forward_cmp()


def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat():
    fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1)))
    fact.forward_cmp()
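The quadruple loop in d4_tensor_compare above encodes the inverted-dropout contract: an output element is either exactly zero (dropped) or the input scaled by 1/keep_prob. A vectorized NumPy sketch of the same check follows; it is not part of the diff, and the random mask is only a stand-in so the assertion can be exercised offline, whereas the real mask comes from the Dropout op on the device.

import numpy as np

def check_inverted_dropout(block, out, keep_prob=0.4):
    # Surviving positions must carry the 1/keep_prob scale; dropped positions are exactly zero.
    kept = out != 0
    assert np.allclose(out[kept], block[kept] * (1.0 / keep_prob), 0.0001, 0.0001)

block = (np.arange(64 * 512 * 8 * 8, dtype=np.float32) % 10).reshape(64, 512, 8, 8)
mask = np.random.default_rng(0).random(block.shape) < 0.4  # stand-in dropout mask
out = np.where(mask, block / 0.4, 0.0)
check_inverted_dropout(block, out)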
tests/ut/python/parallel/parallel_end_to_end/dropout/dropout_parallel_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_dropout_parallel_4p.py>../../log/test_dropout_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allgather_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class MatmulSingle(Cell):
    def __init__(self, transpose_a=False, transpose_b=False):
        super(MatmulSingle, self).__init__()
        self.matmul = P.MatMul(transpose_a, transpose_b)
        self.pow = P.Pow()
        self.reduce_sum = P.ReduceSum()

    def construct(self, x, y):
        out = self.matmul(x, y)
        out = self.pow(out, 2.0)
        out = self.reduce_sum(out, None)
        return out


class MatmulAllgather(Cell):
    def __init__(self, group, transpose_a=False, transpose_b=False):
        super(MatmulAllgather, self).__init__()
        self.allgather = P.AllGather(group=group)
        self.matmul = P.MatMul(transpose_a, transpose_b)
        self.pow = P.Pow()
        self.reduce_sum = P.ReduceSum()
        self.allreduce = P.AllReduce(group=group)

    def construct(self, x, y):
        x = self.allgather(x)
        out = self.matmul(x, y)
        out = self.pow(out, 2.0)
        out = self.reduce_sum(out, None)
        out = self.allreduce(out)
        return out


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, sens):
        return grad_all_with_sens(self.network)(x, y, sens)


class MatmulAllgatherFactory:
    def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra):
        self.inputx = self.gen_value(inputx_shape, 10)
        self.inputy = self.gen_value(inputy_shape, 20)
        self.x_stra = x_stra
        self.y_stra = y_stra
        stra_size = 1
        for s in x_stra:
            stra_size = stra_size * s
        self.stra_size = stra_size

    def gen_value(self, input_shape, delta):
        size = 1
        for s in input_shape:
            size = size * s
        number_range = min(100, size)
        input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
        return input_np

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def grad_mindspore_impl_single(self):
        x = Tensor(self.inputx)
        y = Tensor(self.inputy)
        sens = Tensor(1.0, dtype=ms.float32)
        net = MatmulSingle()
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, sens)
        return input_grad

    def grad_mindspore_impl_reduce(self):
        inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
        inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
        x = Tensor(inputxs[device_id % self.stra_size])
        y = Tensor(inputys[device_id % self.stra_size])
        repeat_num = device_num / self.stra_size
        v = self.stra_size * repeat_num * repeat_num * repeat_num
        sens = Tensor(1.0 / v, dtype=ms.float32)
        net = MatmulAllgather("hccl_world_group")
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, sens)
        return input_grad

    def grad_cmp(self):
        single_results = self.grad_mindspore_impl_single()
        reduce_results = self.grad_mindspore_impl_reduce()
        single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
        reduce_result0 = reduce_results[0].asnumpy()
        single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
        reduce_result1 = reduce_results[1].asnumpy()
        assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
        assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)


def test_reduce_grad():
    inputx_shape = (64, 32)
    inputy_shape = (32, 64)
    fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4))
    fact.grad_cmp()
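In MatmulAllgather.construct above, the AllGather rebuilds the full x from the per-rank row shards before the MatMul; offline that is just a concatenation of the (4, 1)-strategy blocks along axis 0. A small NumPy sketch of that round trip, using the (64, 32) shape from test_reduce_grad (illustrative only, not part of the diff):

import numpy as np

x = np.arange(64 * 32, dtype=np.float32).reshape(64, 32)

# (4, 1) strategy: the 64 rows are split across 4 ranks, columns stay whole.
shards = np.split(x, 4, axis=0)            # what each RANK_ID holds locally
gathered = np.concatenate(shards, axis=0)  # what AllGather reconstructs on every rank
assert gathered.shape == (64, 32)
assert np.array_equal(gathered, x)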
tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allreduce_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class MatmulSingle(Cell):
    def __init__(self, transpose_a=False, transpose_b=False):
        super(MatmulSingle, self).__init__()
        self.matmul1 = P.MatMul(transpose_a, transpose_b)
        self.matmul2 = P.MatMul(transpose_a, transpose_b)
        self.pow = P.Pow()
        self.reduce_sum = P.ReduceSum()

    def construct(self, x, y, z):
        out = self.matmul1(x, y)
        out = self.matmul2(out, z)
        out = self.pow(out, 2.0)
        out = self.reduce_sum(out, None)
        return out


class MatmulReduce(Cell):
    def __init__(self, group, transpose_a=False, transpose_b=False):
        super(MatmulReduce, self).__init__()
        self.matmul1 = P.MatMul(transpose_a, transpose_b)
        self.allreduce1 = P.AllReduce(group=group)
        self.matmul2 = P.MatMul(transpose_a, transpose_b)
        self.pow = P.Pow()
        self.reduce_sum = P.ReduceSum()
        self.allreduce2 = P.AllReduce(group=group)

    def construct(self, x, y, z):
        out = self.matmul1(x, y)
        out = self.allreduce1(out)
        out = self.matmul2(out, z)
        out = self.pow(out, 2.0)
        out = self.reduce_sum(out, None)
        out = self.allreduce2(out)
        return out


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, z, sens):
        return grad_all_with_sens(self.network)(x, y, z, sens)


class MatmulReduceFactory:
    def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra):
        self.inputx = self.gen_value(inputx_shape, 10)
        self.inputy = self.gen_value(inputy_shape, 20)
        self.inputz = self.gen_value(inputz_shape, 30)
        self.x_stra = x_stra
        self.y_stra = y_stra
        self.z_stra = z_stra
        stra_size = 1
        for s in x_stra:
            stra_size = stra_size * s
        self.stra_size = stra_size

    def gen_value(self, input_shape, delta):
        size = 1
        for s in input_shape:
            size = size * s
        number_range = min(100, size)
        input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
        return input_np

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def grad_mindspore_impl_single(self):
        x = Tensor(self.inputx)
        y = Tensor(self.inputy)
        z = Tensor(self.inputz)
        sens = Tensor(1.0, dtype=ms.float32)
        net = MatmulSingle()
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, z, sens)
        return input_grad

    def grad_mindspore_impl_reduce(self):
        inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
        inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
        inputzs = self.get_parallel_blocks(self.inputz, self.z_stra)
        x = Tensor(inputxs[device_id % self.stra_size])
        y = Tensor(inputys[device_id % self.stra_size])
        z = Tensor(inputzs[device_id % self.stra_size])
        repeat_num = device_num / self.stra_size
        v = self.stra_size * repeat_num * repeat_num * repeat_num
        sens = Tensor(1.0 / v, dtype=ms.float32)
        net = MatmulReduce("hccl_world_group")
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, z, sens)
        return input_grad

    def grad_cmp(self):
        single_results = self.grad_mindspore_impl_single()
        reduce_results = self.grad_mindspore_impl_reduce()
        single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
        reduce_result0 = reduce_results[0].asnumpy()
        single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
        reduce_result1 = reduce_results[1].asnumpy()
        single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size]
        reduce_result2 = reduce_results[2].asnumpy()
        assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
        assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
        assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001)


def test_reduce_grad():
    inputx_shape = (32, 64)
    inputy_shape = (64, 64)
    inputz_shape = (64, 32)
    fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4))
    fact.grad_cmp()


def test_reduce_grad_repeat():
    inputx_shape = (32, 64)
    inputy_shape = (64, 64)
    inputz_shape = (64, 32)
    fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2))
    fact.grad_cmp()
tests/ut/python/parallel/parallel_end_to_end/hcom/allgather_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_allgather_4p.py>../../log/test_allgather_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/hcom/allreduce_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_allreduce_4p.py>../../log/test_allreduce_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/l2normalize/_test_l2normalize_parallel_4p.py
Deleted, mode 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class L2normalize(Cell):
    def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None):
        super(L2normalize, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.l2norm = P.L2Normalize(axis, epsilon, strategy1)

    def construct(self, x, y):
        out = self.add(x, y)
        out = self.l2norm(out)
        return out


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class L2normalizeFactory:
    def __init__(self, input_shape, axis, strategy0, strategy1):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4,
                                    input_shape).astype(np.float32)
        target_shape = input_shape
        self.target_shape = target_shape
        target_size = 1
        for s in target_shape:
            target_size = target_size * s
        number_range = min(1000, target_size)
        self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
                                         target_shape).astype(np.float32)
        self.axis = axis
        self.epsilon = 1e-4
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        out_strategy = strategy1[1]
        self.out_strategy = out_strategy
        need_dev_num0 = 1
        need_dev_num1 = 1
        for s in strategy0[1]:
            need_dev_num0 = need_dev_num0 * s
        for s in out_strategy:
            need_dev_num1 = need_dev_num1 * s
        self.x_id = device_id % need_dev_num0
        self.y_id = device_id % need_dev_num0
        self.out_id = device_id % need_dev_num1

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        net = L2normalize(self.axis, self.epsilon)
        out = net(x, y)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        net = L2normalize(self.axis, self.epsilon)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad1 = Tensor(outgrads[self.out_id])
        net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
                              parallel_inputs_run=[x1, y1, output_grad1])
        return input_grad

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
        assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_l2normalize_input_128_512():
    input_shape = (128, 512)
    axis = 0
    fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
    fact.forward_cmp()


def test_reid_l2normalize_grad_input_128_512():
    input_shape = (128, 512)
    axis = 0
    fact = L2normalizeFactory(input_shape, axis, (0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
    fact.grad_cmp()


def test_reid_l2normalize_input_128_512_repeat():
    input_shape = (128, 512)
    axis = 0
    fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
    fact.forward_cmp()


def test_reid_l2normalize_grad_input_128_512_repeat():
    input_shape = (128, 512)
    axis = 0
    fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
    fact.grad_cmp()
tests/ut/python/parallel/parallel_end_to_end/l2normalize/l2normalize_parallel_4p.sh
Deleted, mode 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for((i=0;i<4;i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env >log$i.log
    pytest -s ../test_l2normalize_parallel_4p.py>../../log/test_l2normalize_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/log/README.MD
deleted 100644 → 0
Log files for auto parallel end-to-end test cases.
tests/ut/python/parallel/parallel_end_to_end/loss/_test_loss_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class AddRelu(Cell):
    def __init__(self, strategy0=None, strategy1=None):
        super(AddRelu, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.relu = P.ReLU(strategy=strategy1)

    def construct(self, x, y):
        out = self.add(x, y)
        out = self.relu(out)
        return out


class NetWithLoss(Cell):
    def __init__(self, network, strategy2=None):
        super(NetWithLoss, self).__init__()
        self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2)
        self.network = network

    def construct(self, x, y, b):
        predict = self.network(x, y)
        return self.loss(predict, b)[0]


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, b):
        return grad_all(self.network)(x, y, b)


class AddReluFactory:
    def __init__(self, input_shape, strategy0, strategy1, strategy2):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4,
                                    input_shape).astype(np.float32)
        target_shape = input_shape
        self.target_shape = target_shape
        target_size = 1
        for s in target_shape:
            target_size = target_size * s
        number_range = min(10, target_size)
        self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1,
                                         target_shape).astype(np.float32)
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        self.strategy2 = strategy2
        out_strategy = strategy1[1]
        self.out_strategy = out_strategy
        need_dev_num0 = 1
        need_dev_num1 = 1
        for s in strategy0[1]:
            need_dev_num0 = need_dev_num0 * s
        for s in out_strategy:
            need_dev_num1 = need_dev_num1 * s
        self.x_id = device_id % need_dev_num0
        self.y_id = device_id % need_dev_num0
        self.out_id = device_id % need_dev_num1

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def grad_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        net = AddRelu()
        net_with_loss = NetWithLoss(net)
        grad_net = Grad(net_with_loss)
        grad_net.set_train()
        input_grads = []
        for i in range(0, 3):
            input_grad = grad_net(x, y, output_grad)
            input_grads.append(input_grad)
        return input_grads

    def grad_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad1 = Tensor(outgrads[self.out_id])
        net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
        net_with_loss = NetWithLoss(net, strategy2=self.strategy2)
        grad_net = Grad(net_with_loss)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grads = []
        for i in range(0, 3):
            input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
                                  parallel_inputs_run=[x1, y1, output_grad1])
            input_grads.append(input_grad)
        return input_grads

    def grad_cmp(self):
        input_grad_mindspores = self.grad_mindspore_impl()
        input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl()
        for i in range(0, len(input_grad_mindspores)):
            input_grad_mindspore = input_grad_mindspores[i]
            input_grad_mindspore_parallel = input_grad_mindspore_parallels[i]
            input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
            input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
            input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
            input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
            input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
            input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
            np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy",
                    input_grad_blocks_0[self.x_id])
            np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy",
                    input_grad_blocks_1[self.y_id])
            np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy",
                    input_grad_mindspore_parallel0)
            np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy",
                    input_grad_mindspore_parallel1)
            assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
            assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_l2normalize_grad_input_128_512():
    input_shape = (128, 512)
    fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
                          strategy2=(0, (4, 1), (4, 1)))
    fact.grad_cmp()


def test_reid_l2normalize_grad_input_128_512_stridesplit():
    input_shape = (128, 512)
    fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
                          strategy2=(0, (4, 1), (4, 1)))
    fact.grad_cmp()
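Note on the shard indexing used by AddReluFactory above: the product of the per-dimension cuts in a strategy gives the number of distinct shards, and device_id modulo that product selects the local one (so a (1, 1) strategy maps every device to block 0). The snippet below is a small illustrative sketch of that arithmetic with hypothetical values, not part of the deleted file.

# Hypothetical strategies in the same (id, cuts_x, cuts_y) layout used above.
strategy0 = (0, (4, 1), (4, 1))   # inputs cut into 4 row shards
strategy1 = (0, (4, 1))           # ReLU output keeps the same cuts

def shard_count(cuts):
    n = 1
    for s in cuts:
        n *= s
    return n

for device_id in range(4):
    x_id = device_id % shard_count(strategy0[1])
    out_id = device_id % shard_count(strategy1[1])
    print(device_id, x_id, out_id)   # each of the 4 devices owns a distinct shard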
tests/ut/python/parallel/parallel_end_to_end/loss/loss_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_loss_parallel_4p.py>../../log/test_loss_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/matmul/_test_matmul_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
from numpy import allclose

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Matmul(Cell):
    def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
        super(Matmul, self).__init__()
        self.add = P.TensorAdd(strategy=strategy1)
        self.matmul = P.MatMul(transpose_a, transpose_b, strategy=strategy0)

    def construct(self, x, w, z):
        out = self.add(x, z)
        return self.matmul(out, w)


class BatchMatMul(Cell):
    def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
        super(BatchMatMul, self).__init__()
        self.add = P.TensorAdd(strategy=strategy1)
        self.batchmatmul = P.BatchMatMul(transpose_a, transpose_b, strategy=strategy0)

    def construct(self, x, w, z):
        out = self.add(x, z)
        return self.batchmatmul(out, w)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, inputa, inputb, inputz, output_grad):
        gout = grad_all_with_sens(self.network)(inputa, inputb, inputz, output_grad)
        return gout


class BatchmatmulFactory:
    def __init__(self, inputa_shape, inputb_shape, transpose_a, transpose_b, strategy, strategy_):
        self.strategy = strategy
        self.strategy_ = strategy_
        inputa_size = 1
        inputb_size = 1
        prefix = ""
        for s in inputa_shape:
            prefix = prefix + str(s) + "_"
            inputa_size = inputa_size * s
        prefix = prefix + "and"
        for s in inputb_shape:
            prefix = prefix + str(s) + "_"
            inputb_size = inputb_size * s
        number_rangea = min(1000, inputa_size)
        number_rangeb = min(1000, inputb_size)
        self.inputa = np.reshape(np.arange(0, inputa_size) % number_rangea - number_rangea / 2,
                                 inputa_shape).astype(np.float32)
        self.inputb = np.reshape(np.arange(0, inputb_size) % number_rangeb - number_rangeb / 2,
                                 inputb_shape).astype(np.float32)
        self.inputz = np.zeros(self.inputa.shape).astype(np.float32)
        self.transpose_a = transpose_a
        self.transpose_b = transpose_b

        out_shape = []
        device_matrix = []
        out_strategy = []
        if transpose_a:
            temp = inputa_shape[-1]
            inputa_shape[-1] = inputa_shape[-2]
            inputa_shape[-2] = temp
        if transpose_b:
            temp = inputb_shape[-1]
            inputb_shape[-1] = inputb_shape[-2]
            inputb_shape[-2] = temp

        if len(inputa_shape) >= len(inputb_shape):
            out_shape = list(inputa_shape)
            out_shape[-1] = inputb_shape[-1]
        else:
            out_shape = list(inputb_shape)
            out_shape[-2] = inputa_shape[-2]

        strategy1 = list(self.strategy[1])
        strategy2 = list(self.strategy[2])
        if transpose_a:
            temp = strategy1[-1]
            strategy1[-1] = strategy1[-2]
            strategy1[-2] = temp
        if transpose_b:
            temp = strategy2[-1]
            strategy2[-1] = strategy2[-2]
            strategy2[-2] = temp

        if len(strategy1) >= len(strategy2):
            out_strategy = strategy1.copy()
            out_strategy[-1] = strategy2[-1]
        else:
            out_strategy = strategy2.copy()
            out_strategy[-2] = strategy1[-2]
        device_matrix = out_strategy.copy()
        device_matrix.insert(-1, strategy1[-1])
        self.out_strategy = out_strategy

        need_dev_num = 1
        for s in device_matrix:
            need_dev_num = need_dev_num * s
        self.need_dev_num = need_dev_num
        self.device_matrix = device_matrix

        out_size = 1
        for s in out_shape:
            out_size = out_size * s
        number_range = min(1000, out_size)
        self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range - number_range / 2,
                                         out_shape).astype(np.float32)

        device_index = self.id_to_list(device_id % need_dev_num, self.device_matrix)
        x_index = device_index[:-1].copy()
        if transpose_a:
            temp = x_index[-1]
            x_index[-1] = x_index[-2]
            x_index[-2] = temp
        y_index = device_index[:-3].copy()
        y_index.append(device_index[-2])
        y_index.append(device_index[-1])
        if transpose_b:
            temp = y_index[-1]
            y_index[-1] = y_index[-2]
            y_index[-2] = temp

        out_index = device_index[:-2].copy()
        out_index.append(device_index[-1])

        print(device_matrix)
        print(device_index)

        need_dev_num_ = 1
        for s in strategy_[1]:
            need_dev_num_ = need_dev_num_ * s
        self.x_id = device_id % need_dev_num_
        self.y_id = self.list_to_id(y_index, self.strategy[2])
        self.out_id = self.list_to_id(out_index, self.out_strategy)

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def id_to_list(self, id_, shape):
        """shape: the upper bound of each dimension, e.g. (2, 4, 8)"""
        result = []
        r = id_
        for i in range(0, len(shape)):
            v = 1
            for j in range(i + 1, len(shape)):
                v = v * shape[j]
            result.append(r // v)
            r = r % v
        return result

    def list_to_id(self, id_list, shape):
        result = 0
        for i in range(0, len(id_list)):
            v = 1
            for j in range(i + 1, len(id_list)):
                v = v * shape[j]
            result = result + id_list[i] * v
        return result

    def forward_mindspore_impl(self):
        if len(self.inputa.shape) > 2:
            matmul = BatchMatMul(self.transpose_a, self.transpose_b)
        else:
            matmul = Matmul(self.transpose_a, self.transpose_b)
        matmul.set_train()
        out_me = matmul(Tensor(self.inputa), Tensor(self.inputb), Tensor(self.inputz))
        return out_me.asnumpy()

    def forward_mindspore_parallel_impl(self):
        if len(self.inputa.shape) > 2:
            matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
        else:
            matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        x = Tensor(self.inputa)
        y = Tensor(self.inputb)
        z = Tensor(self.inputz)
        xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
        ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
        zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
        x1 = Tensor(xs[self.x_id])
        y1 = Tensor(ys[self.y_id])  # needs to be derived from the device matrix
        z1 = Tensor(zs[self.x_id])
        matmul.set_train()
        matmul.set_auto_parallel()
        out_me = matmul(x, y, z, parallel_inputs_compile=[x, y, z], parallel_inputs_run=[x1, y1, z1])
        return out_me.asnumpy()

    def grad_mindspore_impl(self):
        x = Tensor(self.inputa)
        y = Tensor(self.inputb)
        z = Tensor(self.inputz)
        if len(self.inputa.shape) > 2:
            matmul = BatchMatMul(self.transpose_a, self.transpose_b)
        else:
            matmul = Matmul(self.transpose_a, self.transpose_b)
        net_me = Grad(matmul)
        net_me.set_train()
        out_grad_me = Tensor(self.output_grad_np)
        out_grad = net_me(x, y, z, out_grad_me)
        return out_grad

    def grad_mindspore_parallel_impl(self):
        if len(self.inputa.shape) > 2:
            matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
        else:
            matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
        x = Tensor(self.inputa)
        y = Tensor(self.inputb)
        z = Tensor(self.inputz)
        out_grad_me = Tensor(self.output_grad_np)
        xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
        ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
        zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
        out_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(xs[self.x_id])  # needs to be derived from the device matrix
        y1 = Tensor(ys[self.y_id])
        z1 = Tensor(zs[self.x_id])
        out_grad1 = Tensor(out_grads[self.out_id])
        net_me = Grad(matmul)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net_me.set_auto_parallel()
        net_me.set_train()
        out_grad = net_me(x, y, z, out_grad_me, parallel_inputs_compile=[x, y, z, out_grad1],
                          parallel_inputs_run=[x1, y1, z1, out_grad1])
        return out_grad

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspores = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        assert allclose(out_mindspores[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspores0 = self.get_parallel_blocks(input_grad_mindspore[0].asnumpy(), self.strategy_[1])
        input_grad_mindspores1 = self.get_parallel_blocks(input_grad_mindspore[1].asnumpy(), self.strategy[2])
        input_grad_mindspores2 = self.get_parallel_blocks(input_grad_mindspore[2].asnumpy(), self.strategy_[1])
        assert allclose(input_grad_mindspores0[self.x_id], input_grad_mindspore_parallel[0].asnumpy(), 0.0001, 0.0001)
        assert allclose(input_grad_mindspores1[self.y_id], input_grad_mindspore_parallel[1].asnumpy(), 0.0001, 0.0001)
        assert allclose(input_grad_mindspores2[self.x_id], input_grad_mindspore_parallel[2].asnumpy(), 0.0001, 0.0001)


def test_reid_batchmatmul_inputa_128_512_inputb_2000_512():
    inputa = [128, 512]
    inputb = [2000, 512]
    fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
    fact.forward_cmp()


def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512():
    inputa = [128, 512]
    inputb = [2000, 512]
    fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
    fact.grad_cmp()


def test_reid_batchmatmul_inputa_128_512_inputb_2000_512_redistribution():
    inputa = [128, 512]
    inputb = [2000, 512]
    fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
    fact.forward_cmp()


def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512_redistribution():
    inputa = [128, 512]
    inputb = [2000, 512]
    fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
    fact.grad_cmp()
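Note on id_to_list and list_to_id above: they are mutually inverse mixed-radix conversions between a flat device rank and its coordinates in the device matrix. The following quick self-contained check, using a hypothetical 2x2x2 device matrix, is illustration only and not part of the deleted file.

def id_to_list(id_, shape):
    # Flat rank -> coordinates, most-significant dimension first.
    result = []
    r = id_
    for i in range(len(shape)):
        v = 1
        for j in range(i + 1, len(shape)):
            v *= shape[j]
        result.append(r // v)
        r %= v
    return result

def list_to_id(id_list, shape):
    # Coordinates -> flat rank.
    result = 0
    for i in range(len(id_list)):
        v = 1
        for j in range(i + 1, len(id_list)):
            v *= shape[j]
        result += id_list[i] * v
    return result

shape = (2, 2, 2)
for rank in range(8):
    assert list_to_id(id_to_list(rank, shape), shape) == rank
print(id_to_list(6, shape))   # [1, 1, 0]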
tests/ut/python/parallel/parallel_end_to_end/matmul/matmul_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_matmul_parallel_4p.py > ../../log/test_matmul_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/max/_test_max_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, input1, input2, output_grad):
        return grad_all_with_sens(self.network)(input1, input2, output_grad)


class Max(Cell):
    def __init__(self, axis, keep_dims, strategy0=None, strategy1=None):
        super(Max, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1)
        self.axis = axis

    def construct(self, input1, input2):
        out = self.add(input1, input2)
        return self.reduce_max(out, self.axis)


class MaxFactory:
    def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1):
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        self.axis = axis
        self.keep_dims = keep_dims
        input_size = 1
        prefix = ""
        for s in input_shape:
            prefix = prefix + str(s) + "_"
            input_size = input_size * s
        number_range = min(1000, input_size)
        self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = self.input_np1.copy()
        self.out_grad_np = None
        out_shape = list(input_shape)
        out_shape.pop(axis)
        out_size = input_size / input_shape[axis]
        number_range_ = min(1000, out_size)
        self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2,
                                      out_shape).astype(np.float32)
        out_strategy = list(strategy1[1])
        out_strategy.pop(axis)
        self.out_strategy = out_strategy
        need_dev_num = 1
        need_dev_num_ = 1
        for s in strategy0[1]:
            need_dev_num = need_dev_num * s
        for s in out_strategy:
            need_dev_num_ = need_dev_num_ * s
        self.x_id = device_id % need_dev_num
        self.y_id = device_id % need_dev_num
        self.out_id = device_id % need_dev_num_

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_mindspore_impl(self):
        input1 = Tensor(self.input_np1)
        input2 = Tensor(self.input_np2)
        net = Max(axis=self.axis, keep_dims=self.keep_dims)
        out = net(input1, input2)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(xs[self.x_id])
        y1 = Tensor(ys[self.y_id])
        net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        input1 = Tensor(self.input_np1)
        input2 = Tensor(self.input_np2)
        out_grad = Tensor(self.out_grad_np)
        net = Max(axis=self.axis, keep_dims=self.keep_dims)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(input1, input2, out_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy)
        out_grad = Tensor(output_grads[self.out_id])
        xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(xs[self.x_id])
        y1 = Tensor(ys[self.y_id])
        net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad],
                              parallel_inputs_run=[x1, y1, out_grad])
        return input_grad

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        print(out_mindspore)
        print(out_mindspore_parallel)
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
        assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_max_forward_input_256_64():
    fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
                      strategy1=(0, (4, 1)))
    fact.forward_cmp()


def test_reid_max_grad_input_256_64():
    fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
                      strategy1=(0, (4, 1)))
    fact.grad_cmp()


def test_reid_max_forward_input_128_64_32_32():
    fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False,
                      strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
    fact.forward_cmp()


def test_reid_max_grad_input_128_64_32_32():
    fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False,
                      strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
    fact.grad_cmp()


def test_reid_max_forward_input_256_64_repeat():
    fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
                      strategy1=(0, (2, 1)))
    fact.forward_cmp()


def test_reid_max_grad_input_256_64_repeat():
    fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
                      strategy1=(0, (2, 1)))
    fact.grad_cmp()
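Note on MaxFactory above: out_strategy is built by dropping the reduced axis from strategy1, mirroring how ReduceMax with keep_dims=False drops that axis from the output shape, and out_id is then taken modulo the remaining shard count. The snippet below is a short illustrative sketch of that derivation with hypothetical values, not part of the deleted file.

input_shape = (256, 64)
axis = 1
strategy1_cuts = [4, 1]            # cuts of the ReduceMax input

out_strategy = list(strategy1_cuts)
out_strategy.pop(axis)             # reduced axis disappears -> [4]

shards = 1
for s in out_strategy:
    shards *= s

for device_id in range(4):
    # each device checks a distinct slice of the (256,)-shaped output
    print(device_id, device_id % shards)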
tests/ut/python/parallel/parallel_end_to_end/max/max_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_max_parallel_4p.py>../../log/test_max_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/mul_softmax/mul_activation_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_mul_softmax_parallel_4p.py>../../log/test_mul_softmax_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/mul_softmax/need_fix_test_mul_softmax_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import pytest

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class MulSoftmax(Cell):
    def __init__(self, strategy0=None, strategy1=None, axis=0):
        super(MulSoftmax, self).__init__()
        self.mul = P.Mul(strategy=strategy0)
        self.softmax = P.Softmax(axis=axis, strategy=strategy1)

    def construct(self, x, z):
        out = self.mul(x, z)
        return self.softmax(out)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class MulSoftmaxFactory:
    def __init__(self, input_shape, strategy0, strategy1):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = 1.0
        self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
                                         input_shape).astype(np.float32)
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        need_dev_num = 1
        need_dev_num_ = 1
        for s in strategy0[1]:
            need_dev_num = need_dev_num * s
        for s in strategy1[1]:
            need_dev_num_ = need_dev_num_ * s
        self.x_id = device_id % need_dev_num
        self.y_id = device_id % need_dev_num
        self.out_id = device_id % need_dev_num_

    def forward_mindspore_impl(self):
        net = MulSoftmax()
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        out = net(x, y)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(self.input_np2, ms.float32)
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        output_grad = Tensor(self.output_grad_np)
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        net = MulSoftmax()
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
        output_grad = Tensor(output_grads[self.out_id])
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_train()
        grad_net.set_auto_parallel()
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(self.input_np2, ms.float32)
        input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
                              parallel_inputs_run=[x1, y1, output_grad])
        return input_grad

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        np.save(path + str(device_id) + "_" + self.prefix + "_forward_parallel.npy", out_mindspore_parallel)
        out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy", input_grad_mindspore_parallel0)
        np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy", input_grad_mindspore_parallel1)
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        # Here, for TensorMul, the first input X1 is not broadcast while X2 is broadcast.
        assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)


@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64():
    stra0 = (0, (1, 4), ())
    stra1 = (0, (1, 4))
    fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
    fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64():
    stra0 = (0, (1, 4), ())
    stra1 = (0, (1, 4))
    fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
    fact.grad_cmp()


@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64_all_to_all():
    stra0 = (0, (4, 1), ())
    stra1 = (0, (1, 4))
    fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
    fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64_all_to_all():
    stra0 = (0, (4, 1), ())
    stra1 = (0, (1, 4))
    fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
    fact.grad_cmp()
tests/ut/python/parallel/parallel_end_to_end/onehot/_test_onehot_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Onehot(Cell):
    def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
        super(Onehot, self).__init__()
        self.onehot = P.OneHot(axis, strategy=strategy)
        self.depth = depth
        self.on_value = Tensor(on_value, ms.float32)
        self.off_value = Tensor(off_value, ms.float32)

    def construct(self, indices):
        return self.onehot(indices, self.depth, self.on_value, self.off_value)


class OneHotFactory:
    def __init__(self, input_shape, depth, on_value=1.0, off_value=0.0, axis=None, dtype=None, strategy0=None):
        size = 1
        prefix = ""
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(10, size)
        self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.int32)
        self.depth = depth
        self.on_value = on_value
        self.off_value = off_value
        self.axis = axis
        self.dtype = dtype
        self.strategy0 = strategy0
        need_dev_num = 1
        for s in strategy0[1]:
            need_dev_num = need_dev_num * s
        self.x_id = device_id % need_dev_num
        self.out_id = device_id % need_dev_num

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def grad_mindspore_impl(self):
        output_grad = Tensor(self.output_grad_np)
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2, ms.float32)
        net = AddRelu()
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def forward_mindspore_impl(self):
        indices = Tensor(self.input_np)
        net = Onehot(axis=self.axis, depth=self.depth, on_value=self.on_value, off_value=self.off_value)
        out = net(indices)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np)
        inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        net = Onehot(axis=self.axis, depth=self.depth, on_value=self.on_value,
                     off_value=self.off_value, strategy=self.strategy0)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
        return out.asnumpy()

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy0[1])
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)


def test_reid_onehot_forward_int32_128_depth13000():
    fact = OneHotFactory(input_shape=(128,),
                         depth=131072,
                         on_value=1.000000,
                         off_value=0.000000,
                         axis=-1,
                         dtype="float32",
                         strategy0=(0, (2,)))
    fact.forward_cmp()


def test_reid_onehot_forward_int32_131072_depth127():
    fact = OneHotFactory(input_shape=(131072,),
                         depth=127,
                         on_value=1.000000,
                         off_value=0.000000,
                         axis=-1,
                         dtype="float32",
                         strategy0=(0, (4,)))
    fact.forward_cmp()
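Note on the OneHot checks above: only the indices are sharded; each device still produces the full depth dimension. The snippet below is a NumPy-only sketch of the expected per-shard result, using small hypothetical sizes and a hypothetical one_hot helper, not part of the deleted file.

import numpy as np

indices = np.arange(8) % 3                 # hypothetical stand-in for self.input_np
depth, on_value, off_value = 4, 1.0, 0.0

def one_hot(idx, depth, on, off):
    # Depth is placed last, matching axis=-1 in the tests above.
    out = np.full((idx.size, depth), off, dtype=np.float32)
    out[np.arange(idx.size), idx] = on
    return out

full = one_hot(indices, depth, on_value, off_value)
shards = np.split(indices, 2)              # strategy (2,): cut the index vector in two
local = one_hot(shards[0], depth, on_value, off_value)
assert np.allclose(np.split(full, 2)[0], local)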
tests/ut/python/parallel/parallel_end_to_end/onehot/onehot_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_onehot_parallel_4p.py>../../log/test_onehot_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/prelu/_test_prelu_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import pytest

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class PReLU(Cell):
    def __init__(self, channel=1, w=0.25, strategy_=None, strategy1_=None):
        super(PReLU, self).__init__()
        self.add = P.TensorAdd(strategy=strategy1_)
        self.prelu = P.PReLU(strategy=strategy_)
        self.channel = channel

    def construct(self, x, z, w):
        out = self.add(x, z)
        return self.prelu(out, w)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, input_, z, w, output_grad):
        return grad_all_with_sens(self.network)(input_, z, w, output_grad)


class PReLUFactory:
    def __init__(self, input_shape, strategy):
        n, c = input_shape[:2]
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                   input_shape).astype(np.float32)
        self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
                                         input_shape).astype(np.float32)
        self.channel = c
        self.weight = np.array([np.float32(0.25)] * c)
        self.strategy = strategy

    def forward_mindspore_impl(self):
        net = PReLU(channel=self.channel, w=self.weight)
        x = Tensor(self.input_np)
        z = Tensor(np.zeros(self.input_np.shape), ms.float32)
        w = Tensor(self.weight)
        out = net(x, z, w)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
                    strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        x = Tensor(self.input_np)
        z = Tensor(np.zeros(self.input_np.shape), ms.float32)
        w = Tensor(self.weight)
        inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
        block_id = device_id % len(inputs)
        x1 = Tensor(inputs[block_id])
        z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
        w1 = Tensor(self.weight)
        out = net(x, z, w, parallel_inputs_compile=[x, z, w], parallel_inputs_run=[x1, z1, w1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        output_grad = Tensor(self.output_grad_np)
        x = Tensor(self.input_np)
        z = Tensor(np.zeros(self.input_np.shape), ms.float32)
        w = Tensor(self.weight)
        net = PReLU(channel=self.channel, w=self.weight)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, z, w, output_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy[1])
        block_id = device_id % len(output_grads)
        output_grad = Tensor(output_grads[block_id])
        x = Tensor(self.input_np)
        z = Tensor(np.zeros(self.input_np.shape), ms.float32)
        w = Tensor(self.weight)
        net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
                    strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
        x1 = Tensor(inputs[block_id])
        z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
        w1 = Tensor(self.weight)
        input_grad = grad_net(x, z, w, output_grad, parallel_inputs_compile=[x, z, w, output_grad],
                              parallel_inputs_run=[x1, z1, w1, output_grad])
        return input_grad

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy[1])
        block_id = device_id % len(out_blocks)
        assert np.allclose(out_blocks[block_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore2 = input_grad_mindspore[2].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_mindspore_parallel2 = input_grad_mindspore_parallel[2].asnumpy()
        input_grad_blocks = self.get_parallel_blocks(input_grad_mindspore0, self.strategy[1])
        input1_grad_blocks = self.get_parallel_blocks(input_grad_mindspore1, self.strategy[1])
        block_id = device_id % len(input_grad_blocks)
        assert np.allclose(input_grad_blocks[block_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert np.allclose(input_grad_mindspore2, input_grad_mindspore_parallel2, 0.0001, 0.0001)
        assert np.allclose(input1_grad_blocks[block_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_repeat():
    stra = (0, (1, 1, 2, 1), (1))
    fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
    fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_repeat():
    stra = (0, (1, 1, 2, 1), (1))
    fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
    fact.grad_cmp()


@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_mix():
    stra = (0, (2, 1, 1, 2), (1))
    fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
    fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_mix():
    stra = (0, (2, 1, 1, 2), (1))
    fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
    fact.grad_cmp()
tests/ut/python/parallel/parallel_end_to_end/prelu/prelu_parallel_4p.sh
deleted 100644 → 0
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_prelu_parallel_4p.py > ../../log/test_prelu_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/reducemean/_test_reducemean_parallel_4p.py
deleted 100644 → 0
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
from numpy import allclose as allclose_nparray

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class GradScalar(Cell):
    def __init__(self, network):
        super(GradScalar, self).__init__()
        self.network = network
        self.sens = Tensor([1.0], dtype=ms.float32)

    def construct(self, x, y):
        return grad_all_with_sens(self.network)(x, y, self.sens)


class ReduceMean(Cell):
    def __init__(self, keep_dims, axis, strategy0=None, strategy1=None):
        super(ReduceMean, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.reduce_mean = P.ReduceMean(keep_dims=keep_dims).set_strategy(strategy=strategy1)
        self.axis = axis

    def construct(self, x, y):
        out = self.add(x, y)
        return self.reduce_mean(out, self.axis)


class ReduceMeanFactory:
    def __init__(self, input_shape, keep_dims, axis, strategy0=None, strategy1=None):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4,
                                    input_shape).astype(np.float32)
        self.keep_dims = keep_dims
        self.axis = axis
        target_shape = self.input_np1.mean(axis=axis, keepdims=keep_dims).shape
        target_size = 1
        for s in target_shape:
            target_size = target_size * s
        number_range = min(1000, target_size)
        self.output_grad_np = np.array([1.0], dtype=np.float32)
        if len(target_shape) > 0:
            self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range,
                                             target_shape).astype(np.float32) + 1.0
        self.shape = target_shape
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        out_strategy = []
        axis_ = list(axis)
        if axis_[0] == -1:
            axis_[0] = len(input_shape) - 1
        for i in range(0, len(input_shape)):
            if i in axis_:
                if keep_dims:
                    out_strategy.append(1)
            else:
                out_strategy.append(strategy1[1][i])
        self.out_strategy = out_strategy
        need_dev_num0 = 1
        need_dev_num1 = 1
        for s in strategy0[1]:
            need_dev_num0 = need_dev_num0 * s
        for s in out_strategy:
            need_dev_num1 = need_dev_num1 * s
        self.x_id = device_id % need_dev_num0
        self.y_id = device_id % need_dev_num0
        block_id = device_id % need_dev_num0
        device_index = self.id_to_list(block_id, self.strategy1[1])
        print(device_index)
        for i in axis:
            device_index[i] = 0
        print(device_index)
        self.out_id = self.list_to_id(device_index, self.out_strategy)
        print(self.out_id)

    def id_to_list(self, id_, shape):
        result = []
        r = id_
        for i in range(0, len(shape)):
            v = 1
            for j in range(i + 1, len(shape)):
                v = v * shape[j]
            result.append(r // v)
            r = r % v
        return result

    def list_to_id(self, id_list, shape):
        result = 0
        for i in range(0, len(id_list)):
            v = 1
            for j in range(i + 1, len(id_list)):
                v = v * shape[j]
            result = result + id_list[i] * v
        return result

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
        out = net(x, y)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis,
                         strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        out_grad = Tensor(self.output_grad_np)
        net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, out_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad1 = Tensor(outgrads[self.out_id])
        net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis,
                         strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad,
                              parallel_inputs_compile=[x, y, output_grad1],
                              parallel_inputs_run=[x1, y1, output_grad1])
        return input_grad

    def forward_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
        assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_reducemean_input_64x16():
    fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,),
                             strategy0=(0, (4,), (4,)), strategy1=(0, (4,)))
    fact.forward_cmp()


def test_grad_reid_reducemean_input_64x16():
    fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,),
                             strategy0=(0, (4,), (4,)), strategy1=(0, (4,)))
    fact.grad_cmp()


def test_reid_reducemean_input_64x128x28x28():
    fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
                             strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
                             strategy1=(0, (2, 1, 2, 1)))
    fact.forward_cmp()


def test_grad_reid_reducemean_input_64x128x28x28():
    fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
                             strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
                             strategy1=(0, (2, 1, 2, 1)))
    fact.grad_cmp()
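The factory above compares a single-process run against the per-device run by slicing the reference data itself: get_parallel_blocks cuts the full array into one block per logical device, id_to_list / list_to_id are plain mixed-radix conversions between a flat device id and its per-axis block index, and for ReduceMean the positions of the reduced axes are zeroed before re-encoding against the output layout. The following NumPy-only sketch (illustrative shapes, no MindSpore) walks through that bookkeeping for the (2, 1, 2, 1) strategy used in the keep_dims test.

# NumPy-only sketch of the block bookkeeping in ReduceMeanFactory; the shapes
# below are illustrative and smaller than the ones used in the tests.
import numpy as np

def get_parallel_blocks(input_, strategy):
    # Cut the full array into one block per logical device, axis by axis.
    blocks = [input_]
    for axis, parts in enumerate(strategy):
        blocks = [piece for b in blocks for piece in np.split(b, parts, axis=axis)]
    return blocks

def id_to_list(id_, shape):
    # Mixed-radix decode: flat device id -> per-axis block index.
    result = []
    for i in range(len(shape)):
        radix = int(np.prod(shape[i + 1:], dtype=int))
        result.append(id_ // radix)
        id_ %= radix
    return result

def list_to_id(id_list, shape):
    # Mixed-radix encode: per-axis block index -> flat device id.
    return sum(idx * int(np.prod(shape[i + 1:], dtype=int)) for i, idx in enumerate(id_list))

data = np.arange(2 * 1 * 4 * 4, dtype=np.float32).reshape(2, 1, 4, 4)
strategy = (2, 1, 2, 1)                  # 4 logical devices: dims 0 and 2 are split
blocks = get_parallel_blocks(data, strategy)
assert len(blocks) == 4 and blocks[0].shape == (1, 1, 2, 4)

device_id = 3
index = id_to_list(device_id, strategy)  # -> [1, 0, 1, 0]
assert list_to_id(index, strategy) == device_id
assert np.array_equal(blocks[device_id], data[1:2, :, 2:4, :])

# Reducing over axes (2, 3) with keep_dims=True collapses the split along axis 2,
# so those positions are zeroed before re-encoding against the output layout.
index[2] = index[3] = 0
out_id = list_to_id(index, [2, 1, 1, 1])  # -> 1: this device checks output block 1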
tests/ut/python/parallel/parallel_end_to_end/reducemean/reducemean_parallel_4p.sh
Deleted
100644 → 0
View file @ f967700e
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_reducemean_parallel_4p.py > ../../log/test_reducemean_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/reshape/_test_reshape_parallel_4p.py
Deleted
100644 → 0
View file @ f967700e
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import pytest
from numpy import allclose as allclose_nparray

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class Reshape(Cell):
    def __init__(self, target_shape, strategy0=None, strategy1=None):
        super(Reshape, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.reshape = P.Reshape(strategy=strategy1)
        self.shape = tuple(target_shape)

    def construct(self, input1, input2):
        x = self.add(input1, input2)
        return self.reshape(x, self.shape)


class ReshapeFactory:
    def __init__(self, input_shape, target_shape, strategy0, strategy1):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4,
                                    input_shape).astype(np.float32)
        target_size = 1
        for s in target_shape:
            target_size = target_size * s
        number_range = min(1000, target_size)
        self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
                                         target_shape).astype(np.float32)
        self.target_shape = target_shape
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        out_strategy = [1] * len(target_shape)
        out_strategy[0] = strategy1[1][0]
        self.out_strategy = out_strategy
        need_dev_num0 = 1
        need_dev_num1 = 1
        for s in strategy0[1]:
            need_dev_num0 = need_dev_num0 * s
        for s in out_strategy:
            need_dev_num1 = need_dev_num1 * s
        self.x_id = device_id % need_dev_num0
        self.y_id = device_id % need_dev_num0
        self.out_id = device_id % need_dev_num1

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def forward_reshape_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        net = Reshape(self.target_shape)
        out = net(x, y)
        return out.asnumpy()

    def forward_reshape_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_reshape_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        net = Reshape(self.target_shape)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def grad_reshape_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad1 = Tensor(outgrads[self.out_id])
        net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad,
                              parallel_inputs_compile=[x, y, output_grad1],
                              parallel_inputs_run=[x1, y1, output_grad1])
        return input_grad

    def forward_reshape_cmp(self):
        out_mindspore = self.forward_reshape_mindspore_impl()
        out_mindspore_parallel = self.forward_reshape_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_reshape_cmp(self):
        input_grad_mindspore = self.grad_reshape_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_reshape_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
        assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


@pytest.mark.reid_forward
def test_reid_reshape_input_128x512x7x7_target_128x25088():
    fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
                          strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)),
                          strategy1=(0, (4, 1, 1, 1)))
    fact.forward_reshape_cmp()


def test_reid_reshape_grad_input_128x512x7x7_target_128x25088():
    fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
                          strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)),
                          strategy1=(0, (4, 1, 1, 1)))
    fact.grad_reshape_cmp()


@pytest.mark.reid_forward
def test_reid_reshape_input_128x64_target_128x64x1x1():
    fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1),
                          strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
    fact.forward_reshape_cmp()


@pytest.mark.reid_grad
def test_reid_reshape_grad_input_128x64_target_128x64x1x1():
    fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1),
                          strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
    fact.grad_reshape_cmp()
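ReshapeFactory keeps only the first-dimension split in out_strategy (out_strategy[0] = strategy1[1][0]) because, with the (4, 1, 1, 1)-style layouts used above, every device owns whole leading rows and the reshape is purely local: reshaping a device's slice gives exactly its slice of the reshaped full tensor. A small NumPy-only check of that claim, with scaled-down stand-ins for the (128, 512, 7, 7) → (128, 25088) case:

# NumPy-only check: with only dim 0 sharded, reshape needs no data exchange.
import numpy as np

full = np.arange(8 * 3 * 2 * 2, dtype=np.float32).reshape(8, 3, 2, 2)
target = (8, 12)                      # stand-in for (128, 25088)
splits = 4                            # dim 0 sharded across 4 devices

full_out_blocks = np.split(full.reshape(target), splits, axis=0)
for rank, local_in in enumerate(np.split(full, splits, axis=0)):
    local_out = local_in.reshape(-1, target[1])   # each device reshapes its own slice
    assert np.array_equal(local_out, full_out_blocks[rank])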
tests/ut/python/parallel/parallel_end_to_end/reshape/reshape_parallel_4p.sh
Deleted
100644 → 0
View file @ f967700e
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_reshape_parallel_4p.py > ../../log/test_reshape_parallel_4p_log$i.log 2>&1 &
    cd ..
done
tests/ut/python/parallel/parallel_end_to_end/transpose/_test_transpose_parallel_4p.py
Deleted
100644 → 0
View file @ f967700e
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
from numpy import allclose as allclose_nparray

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
    print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
    distributedTool.init()
    distributedTool.create_group("0-3", [0, 1, 2, 3])
    print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
    print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Net(Cell):
    def __init__(self, perm_in, strategy0=None, strategy1=None):
        super(Net, self).__init__()
        self.add = P.TensorAdd(strategy=strategy0)
        self.transpose = P.Transpose(strategy=strategy1)
        self.perm_in = perm_in

    def construct(self, x, y):
        out = self.add(x, y)
        return self.transpose(out, self.perm_in)


class Grad(Cell):
    def __init__(self, network):
        super(Grad, self).__init__()
        self.network = network

    def construct(self, x, y, output_grad):
        return grad_all_with_sens(self.network)(x, y, output_grad)


class TransposeFactory:
    def __init__(self, input_shape, perm_in, strategy0, strategy1):
        prefix = ""
        size = 1
        for s in input_shape:
            prefix = prefix + str(s)
            size = size * s
        self.prefix = prefix
        number_range = min(1000, size)
        self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2,
                                    input_shape).astype(np.float32)
        self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4,
                                    input_shape).astype(np.float32)
        target_shape = self.input_np1.transpose(perm_in).shape
        target_size = 1
        for s in target_shape:
            target_size = target_size * s
        number_range = min(1000, target_size)
        self.target_shape = target_shape
        self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
                                         target_shape).astype(np.float32)
        self.perm_in = perm_in
        self.strategy0 = strategy0
        self.strategy1 = strategy1
        out_strategy = []
        for i in perm_in:
            out_strategy.append(strategy1[1][i])
        self.out_strategy = out_strategy
        need_dev_num0 = 1
        need_dev_num1 = 1
        for s in strategy0[1]:
            need_dev_num0 = need_dev_num0 * s
        for s in out_strategy:
            need_dev_num1 = need_dev_num1 * s
        self.x_id = device_id % need_dev_num0
        self.y_id = device_id % need_dev_num0
        device_index = self.id_to_list(device_id % need_dev_num1,
                                       self.strategy1[1])  # encoding to get the index before transpose
        device_index_transpose = []
        for i in perm_in:
            device_index_transpose.append(device_index[i])
        self.out_id = self.list_to_id(device_index_transpose, self.out_strategy)

    def get_parallel_blocks(self, input_, strategy):
        blocks = [input_]
        i = 0
        for stra in strategy:
            temp = []
            while len(blocks) > 0:
                block = blocks.pop(0)
                temp.extend(np.split(block, stra, axis=i))
            blocks.extend(temp)
            i += 1
        return blocks

    def id_to_list(self, id_, shape):
        result = []
        r = id_
        for i in range(0, len(shape)):
            v = 1
            for j in range(i + 1, len(shape)):
                v = v * shape[j]
            result.append(r // v)
            r = r % v
        return result

    def list_to_id(self, id_list, shape):
        result = 0
        for i in range(0, len(id_list)):
            v = 1
            for j in range(i + 1, len(id_list)):
                v = v * shape[j]
            result = result + id_list[i] * v
        return result

    def forward_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        net = Net(self.perm_in)
        out = net(x, y)
        return out.asnumpy()

    def forward_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net.set_auto_parallel()
        out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
        return out.asnumpy()

    def grad_mindspore_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        net = Net(self.perm_in)
        grad_net = Grad(net)
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad)
        return input_grad

    def grad_mindspore_parallel_impl(self):
        x = Tensor(self.input_np1)
        y = Tensor(self.input_np2)
        output_grad = Tensor(self.output_grad_np)
        inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
        inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
        outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
        x1 = Tensor(inputs_x[self.x_id])
        y1 = Tensor(inputs_y[self.y_id])
        output_grad1 = Tensor(outgrads[self.out_id])
        net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
        grad_net = Grad(net)
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        grad_net.set_auto_parallel()
        grad_net.set_train()
        input_grad = grad_net(x, y, output_grad,
                              parallel_inputs_compile=[x, y, output_grad1],
                              parallel_inputs_run=[x1, y1, output_grad1])
        return input_grad

    def forward_transpose_cmp(self):
        out_mindspore = self.forward_mindspore_impl()
        out_mindspore_parallel = self.forward_mindspore_parallel_impl()
        out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
        assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

    def grad_transpose_cmp(self):
        input_grad_mindspore = self.grad_mindspore_impl()
        input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
        input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
        input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
        input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
        input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
        input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
        assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
        assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_transpose_input_256x512_output_512x256_perm_1x0():
    fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
    fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_256x512_output_512x256_perm_1x0():
    fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
    fact.grad_transpose_cmp()


def test_reid_transpose_input_512x256_output_256x512_perm_1x0():
    fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
    fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0():
    fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
    fact.grad_transpose_cmp()


def test_reid_transpose_input_512x256_output_256x512_perm_1x0_repeat():
    fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
    fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0_repeat():
    fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
    fact.grad_transpose_cmp()
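TransposeFactory permutes the input strategy to obtain the output layout and re-encodes each device's block index through the same permutation (device_index_transpose above); for perm (1, 0) the device that holds input block [r0, r1] checks output block [r1, r0]. Below is a NumPy-only sketch of that remapping, with a small matrix standing in for the (256, 512) input and the (2, 2) strategy from the first test:

# NumPy-only sketch of the output-block remapping used by TransposeFactory.
import numpy as np

def blocks_of(arr, strategy):
    # Same splitting order as get_parallel_blocks: axis by axis, left to right.
    blocks = [arr]
    for axis, parts in enumerate(strategy):
        blocks = [piece for b in blocks for piece in np.split(b, parts, axis=axis)]
    return blocks

perm = (1, 0)
in_strategy = (2, 2)
out_strategy = tuple(in_strategy[p] for p in perm)     # input strategy read through the permutation

x = np.arange(4 * 6, dtype=np.float32).reshape(4, 6)   # stand-in for (256, 512)
in_blocks = blocks_of(x, in_strategy)
out_blocks = blocks_of(x.transpose(perm), out_strategy)

for device_id in range(4):
    r0, r1 = divmod(device_id, in_strategy[1])         # block index before transpose
    out_id = [r0, r1][perm[0]] * out_strategy[1] + [r0, r1][perm[1]]
    assert np.array_equal(in_blocks[device_id].transpose(perm), out_blocks[out_id])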
tests/ut/python/parallel/parallel_end_to_end/transpose/transpose_parallel_4p.sh
Deleted
100644 → 0
View file @ f967700e
#!/bin/bash
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
for ((i=0; i<4; i++));
do
    rm -rf device$i
    mkdir device$i
    cd device$i
    mkdir output
    source ../../dist_env_4p.sh $i
    env > log$i.log
    pytest -s ../test_transpose_parallel_4p.py > ../../log/test_transpose_parallel_4p_log$i.log 2>&1 &
    cd ..
done