magicwindyyd/mindspore (fork of MindSpore/mindspore)

Commit 2d0ee054
Authored May 26, 2020 by z00478463

for pylint 2nd

Parent: 648501da
Showing 12 changed files with 2,056 additions and 1,592 deletions (+2056 −1592).
mindspore/ops/_op_impl/_custom_op/__init__.py                            +0    -11
mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py                   +87   -76
mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py                  +36   -30
mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py                 +551  -313
mindspore/ops/_op_impl/_custom_op/img2col_impl.py                        +870  -771
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py         +102  -80
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py        +92   -63
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py    +83   -75
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py    +32   -32
mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py                    +51   -44
mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py                 +24   -19
mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py                 +128  -78
mindspore/ops/_op_impl/_custom_op/__init__.py

@@ -14,14 +14,3 @@
 # ============================================================================
 """custom ops"""
-from .batch_matmul_impl import CusBatchMatMul
-from .cholesky_trsm_impl import CusCholeskyTrsm
-from .fused_abs_max1_impl import CusFusedAbsMax1
-from .img2col_impl import CusImg2Col
-from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft
-from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight
-from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast
-from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul
-from .matmul_cube_impl import CusMatMulCube
-from .matrix_combine_impl import CusMatrixCombine
-from .transpose02314_impl import CusTranspose02314
mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py

@@ -14,29 +14,31 @@
# ============================================================================
"""batch_matmul_impl"""
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("batchmatmul.so") \
    .compute_cost(10) \
    .kernel_name("CusBatchMatMul") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()
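Note: the three DataType.F32_Default arguments to dtype_format map positionally onto the declared ports (input x1, input x2, output y), so the op is registered for the all-float32 case only.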
def _get_flattern_shape(shape):
    flattern_shape = 1
    for dim in shape:
        flattern_shape *= dim
    return (flattern_shape,)
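For reference, the helper just collapses a shape into a one-element tuple holding the total element count; the kernel then addresses its global-memory tensors as flat 1-D buffers. A pure-Python sanity check (illustrative, not part of the diff):

    # _get_flattern_shape reduces a shape to its flat element count
    assert _get_flattern_shape((8, 128, 128)) == (131072,)  # 8 * 128 * 128
    assert _get_flattern_shape((1, 64, 64)) == (4096,)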
def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
    t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
...
@@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_
                           matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 64],
                               matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)


def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf)
    tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0)
    with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
        input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB",
                                               scope=tik.scope_ubuf)
        t_1_local_UB = input_2_local_UB
        matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB",
...
@@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu
                           1, 1, 1, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 32],
                               matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0)
@op_info_register(cus_batchmatmul_op_info)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    if util.get_product_version() == util.VERSION_MINI:
...
@@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                 dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # if not transpose_a and transpose_b:
    batch, m, k = x1_shape
    _, n, _ = x2_shape
    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)
    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                      res, res_index)
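The index arithmetic above spreads the 36 batch matrices over 18 AI cores: each block_idx owns two 128x128 matrices (32768 = 2 * 128 * 128 float32 elements), cc0 picks the matrix within the pair, and cc1 * 128 steps through its rows. A small pure-Python model (illustrative only) confirms the tiling visits every row start exactly once:

    rows = set()
    for block_idx in range(18):
        for cc0 in range(2):
            for cc1 in range(128):
                rows.add(block_idx * 32768 + cc0 * 16384 + cc1 * 128)
    assert len(rows) == 36 * 128              # one entry per row, no collisions
    assert max(rows) + 128 == 36 * 128 * 128  # exactly spans the (36, 128, 128) buffer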
    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB,
                                               input1[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 +
                                                      cc1_db * 256 + thread_idx * 128],
                                               0, 1, 16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64],
                                               0, 64, 1, 1, 16, 0)
...
@@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
                                                                             name="matmul_hybrid_f_t_local_UB",
                                                                             scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(
                                dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192],
                                                   0, 1, 1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB,
                                              128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64],
                                              64, 1, 1, 1, 16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 +
                                                       cc1_db * 256 + thread_idx * 128 + thread_idx2 * 64],
                                                   matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
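Functionally, the vmul/vcadd pairs above evaluate a row-times-matrix product with transpose_b=True, i.e. res[b, i, j] = sum_k x1[b, i, k] * x2[b, j, k]. A plain-Python reference of that contraction (illustrative, assuming nested-list inputs):

    def batchmatmul_ref(x1, x2):
        # res[b][i][j] = sum_k x1[b][i][k] * x2[b][j][k]   (transpose_a=False, transpose_b=True)
        return [[[sum(a_row[k] * b_row[k] for k in range(len(a_row)))
                  for b_row in x2[b]]
                 for a_row in x1[b]]
                for b in range(len(x1))]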
    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
...
@@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index,
                                             input2, input2_index, res, res_index)
    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)
                        ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
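Given the .get("shape")/.get("dtype") accesses, the op evidently takes dict-style TBE descriptors; a hypothetical invocation (the argument dicts are assumptions, not from this diff) would look like:

    x1 = {"shape": (8, 128, 128), "dtype": "float32"}  # hypothetical descriptor
    x2 = {"shape": (8, 128, 128), "dtype": "float32"}
    y = {"shape": (8, 128, 128), "dtype": "float32"}
    tik_instance = CusBatchMatMul(x1, x2, y, transpose_a=False, transpose_b=True,
                                  kernel_name="batchmatmul")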
mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py

@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusCholeskyTrsm"""
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("choleskytrsm.so") \
    .compute_cost(10) \
    .kernel_name("CusCholeskyTrsm") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


@op_info_register(cus_cholesky_trsm_op_info)
def CusCholeskyTrsm(input_x, output, kernel_name):
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
...
@@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
    input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub",
                                         scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(input_x_ub[i, 0],
                                   input_x[block_index * split_dim + i, block_index * split_dim],
                                   0, 1, vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
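The vln/vmuls/vexp sequence is a vectorized reciprocal square root, using exp(-0.5 * ln(x)) = x^(-1/2); scalar2 therefore ends up holding 1/sqrt of the i-th diagonal element. In plain Python:

    import math
    x = 4.0
    assert math.isclose(math.exp(-0.5 * math.log(x)), 1.0 / math.sqrt(x))  # both equal 0.5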
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2,
                               vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3,
                                   vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)
        chol_diag_element_final = tik_instance.Scalar("float32")
...
@@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop,
                                   vector_repeat_times, 1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,
                                  1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,
                              1, 1, 1, 8, 8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,
                               vector_repeat_times, 1, 1, 8, 8)
        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
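Reading the two phases together with the op name (one interpretation of the code above, not stated in the diff): per 128x128 diagonal block A, the first loop builds a triangular factor with rsqrt-normalized rows and the second performs the triangular back-substitution, so that roughly

$$A = L L^{\top}, \qquad \text{res} \approx L^{-1}.$$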
mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py

(diff collapsed, not shown)

mindspore/ops/_op_impl/_custom_op/img2col_impl.py

(diff collapsed, not shown)
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py

@@ -17,17 +17,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from te import tik
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
...
@@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \
    .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
...
@@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if m_shape != 1:
        if n_shape == 1:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
            elif km_shape % k_block_size != 0:
                raise RuntimeError("input shape K1 should be multiple of %d" % cce.BLOCK_IN)
        else:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
    if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
        raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
...
...
@@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if
len
(
shape_bias
):
if
len
(
shape_bias
)
==
1
:
if
is_gevm
or
is_gemv
:
if
shape_bias
[
0
]
!=
m_shape
*
n_shape
:
if
shape_bias
[
0
]
!=
m_shape
*
n_shape
:
raise
RuntimeError
(
"broadcast case shape bias for gemv must be equal m*n"
)
else
:
if
shape_bias
[
0
]
!=
n_shape
:
...
...
@@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    else:
        raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return shape_bias
    else:
        bias_length = (bias_length // 16) * 16 + 16
        shape_bias = []
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
    res = []
    if dim_a % 16 != 0:
        dim_a = (dim_a // 16) * 16 + 16
        res.append(dim_a)
    else:
        res.append(dim_a)
    if dim_b % 16 != 0:
        dim_b = (dim_b // 16) * 16 + 16
        res.append(dim_b)
    else:
        res.append(dim_b)
    return res
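Both helpers round odd dimensions up to the next multiple of 16, the cube tile width. The same rule in one line (illustrative):

    def _round_up_16(dim):
        # mirrors the branches in _get_input_shape above
        return dim if dim % 16 == 0 else (dim // 16) * 16 + 16

    assert _round_up_16(63) == 64 and _round_up_16(64) == 64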
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
...
@@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
    try:
        trans_a_f = bool(1 - trans_a)
        if src_dtype == "float32" or src_dtype == "int32":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
...
@@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str)
@op_info_register(matmul_cube_dense_left_op_info)
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                           kernel_name="matmulcube"):
    """
    calculating matrix multiplication with bias, C = A*B + bias, support input
    data with fractal format.
...
@@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
...
@@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    format_a = "FRACTAL_NZ"
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
    format_b = "FRACTAL_NZ"
    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
...
@@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
                               dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)
    if len(shape_bias) > 0:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype)
    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 8, 128, 55 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256],
                                          0, 8, 8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096],
                                              input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 +
                                                     core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 7, 112, 56 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256],
                                          0, 7, 7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096],
                                              input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 +
                                                     core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul])
        return tik_instance
    else:
        print("come into tbe, shape is error!")
        result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
                                    format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
        with tvm.target.cce():
            schedule = generic.auto_schedule(result)
        tensor_list = [tensor_a, tensor_b, result]
        if len(shape_bias) > 0:
            tensor_list = [tensor_a, tensor_b, tensor_bias, result]
        config = {"print_ir": False,
                  "name": kernel_name,
                  "tensor_list": tensor_list}
        te.lang.cce.cce_build_code(schedule, config)
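The hand-tiled branch is gated on shape_a_temp == (63, 63, 16, 16) with a (128, 63, ...) right operand, which follows from the fractal blocking computed above (shape_a_temp = (m_shape // block_in, km_shape // block_reduce, 16, 16)); with 16x16 blocks a 1008 x 1008 left matrix yields 1008 // 16 = 63 blocks per axis. All other shapes fall through to the generic te.lang.cce.matmul path. A quick check with a hypothetical helper (not from the diff):

    def to_fractal(m, k, block=16):
        # mirrors shape_a_temp = (m // block_in, k // block_reduce, block_in, block_reduce)
        return (m // block, k // block, block, block)

    assert to_fractal(1008, 1008) == (63, 63, 16, 16)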
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py

@@ -18,15 +18,10 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .fusion_type("OPAQUE") \
...
@@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .input(2, "x3", False, "required", "all") \
    .input(3, "x4", False, "optional", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default,
                  DataType.F32_FracNZ) \
    .get_op_info()


@op_info_register(matmul_cube_dense_right_op_info)
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False,
                            trans_b=False, kernel_name="matmulcube"):
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1,)
    support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
...
@@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf,
                                                      name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])
            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf,
                                                      name="resMatmul_local_UB1")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc,
                                                                name="resMatmul_local_UB_local_L0C1")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")
            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                              name="input_2_local_L1_local_L0B1")
            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                           0, 8, 128, 1920, 0)
                    tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096],
                                           0, 8, 256, 752, 0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048],
                                              input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048],
                                              input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128, 128, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar,
                                       255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)
                    tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB,
                                           0, 8, 512, 0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096],
                                       0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256],
                                          0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256],
                                          0, 8, 16, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B, 256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                   matrix_max_scalar, 2, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB,
                                       0, 8, 512, 0, 1504)
                tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288],
                                       0, 8, 240, 768, 0)
                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048],
                                          input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048],
                                          input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar,
                                   255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64],
                                   matrix_max_scalar, 225, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1,
                                       0, 8, 480, 0, 1536)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3],
                              outputs=[resMatmul])
        return tik_instance
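Net effect of the vmuls passes over every result tile: the kernel writes the matmul rescaled by the scalar streamed in through matrix_max (as read via matrix_max_local_UB above), i.e.

$$\text{res} = s \cdot (X_1 X_2), \qquad s = \text{matrix\_max}[0].$$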
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py

@@ -17,11 +17,12 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.platform.cce_params as cce
from te import tvm
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
...
@@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \
    .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
...
@@ -137,6 +139,7 @@ src_dtype: str
    else:
        raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
...
@@ -147,6 +150,7 @@ def _get_bias(shape_bias):
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
...
@@ -164,6 +168,7 @@ def _get_input_shape(shape_x):
        res.append(dim_b)
    return res


def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
...
@@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_fracz_left_cast_op_info)
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
...
@@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
...
@@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
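Swapping both dimensions of an operand and inverting its transpose flag leaves the logical product unchanged, which is what lets the code above reorder the shapes and flip `trans_a`/`trans_b` for the fractal layout. A small NumPy check of that identity (shapes are arbitrary examples):

```python
import numpy as np

a = np.random.rand(32, 48).astype(np.float16)
b = np.random.rand(48, 16).astype(np.float16)

ref = a @ b           # reference product, trans_a=False
a_swapped = a.T       # what a [k, m]-ordered layout effectively stores
out = a_swapped.T @ b # transpose flag flipped back inside the kernel
assert np.allclose(ref, out)
```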
@@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
@@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
                         diag_opt=diag_opt, diag_size=DIAG_SIZE)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul])
    return tik_instance


def get_cus_tile_info(input_x1, input_x2, diag_size):
    tile_map = {((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16),
@@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
                ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16),
                ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9),
                ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4),
                ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4),
                ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1),
                ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16),
                ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1),
                ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2),
                ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8),
                ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8),
@@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
                }
    shape_info = (tuple(input_x1.shape), tuple(input_x2.shape))
    diag_opt = False
    if input_x1.shape[0] * input_x1.shape[3] > diag_size:
        diag_opt = True
    if shape_info not in tile_map:
        raise ValueError("shape %s is not supported" % str(shape_info))
    mo_tile, ko_tile, no_tile = tile_map[shape_info]
    return mo_tile, ko_tile, no_tile, diag_opt
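The tiling table keys on the fractal shapes of both operands. A minimal sketch of a lookup, using a stand-in object with a `.shape` attribute (the shapes are ones that appear in the table above):

```python
from collections import namedtuple

# Stand-in for the TIK tensor handles passed in; only .shape is needed here.
FakeTensor = namedtuple("FakeTensor", ["shape"])

x1 = FakeTensor(shape=(32, 32, 16, 16))
x2 = FakeTensor(shape=(128, 32, 16, 16))
# Returns (mo_tile, ko_tile, no_tile, diag_opt); this pair maps to (8, 8, 16),
# and diag_opt is True because 32 * 16 = 512 exceeds the 128 diagonal size.
print(get_cus_tile_info(x1, x2, diag_size=128))
```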
def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
                         res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128):
    ko, mo, mi, ki = input_x1.shape
@@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
        raise ValueError("shape of input_x1 or input_x2 is not supported!")
    if not trans_a or not trans_b:
        raise ValueError("only trans_a=False and trans_b=False are supported!")
    core_m_num = mo // mo_tile
    loop_n_num = no // no_tile
    if loop_n_num * core_m_num <= maxblocknum:
@@ -432,7 +440,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
    else:
        raise ValueError("Does not support this scenario!")
    block_num = core_m_num * core_n_num
    loop_k_num = ko // ko_tile
    if diag_opt:
        loop_k_num = diag_outer // ko_tile
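When `diag_opt` is set, the k-loop count shrinks from `ko // ko_tile` to `diag_outer // ko_tile`, which suggests only a diagonal band of the K axis is traversed instead of the whole reduction. A small arithmetic sketch of the two loop counts, assuming `diag_outer` counts the fractal blocks inside the diagonal band (all values hypothetical):

```python
# Hypothetical sizes: full K has 128 outer fractal blocks, tile covers 8,
# and the diagonal band spans 16 outer blocks.
ko, ko_tile, diag_outer = 128, 8, 16

loop_k_full = ko // ko_tile          # 16 iterations over the whole K axis
loop_k_diag = diag_outer // ko_tile  # 2 iterations over the diagonal band only
print(loop_k_full, loop_k_diag)
```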
@@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
            core_n = block_idx % core_n_num
            with tik_instance.for_range(0, loop_n_num) as cc_n:
                res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
                                              name="resMatmul_L0C", scope=tik.scope_cc)
                with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
                    # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1
                    input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_ub",
@@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
                        input_x2_cast_ub[count * repeate_times_max * vectorfp32_size],
                        input_x2_ub[count * repeate_times_max * vectorfp32_size],
                        repeate_num, 1, 1, 4, 8)
                    input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1,
                                           no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0)
                    # input_x1 -> input_x1_L1
                    input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0],
                                                      name="input_x1_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x1_L1, input_x1[k_idx, core_m * mo_tile, 0, 0], 0,
                                           ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize,
                                           (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0)
                    # input_x2_L1 -> input_x2_L0B
                    input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0],
                                                       name="input_x2_L0B", scope=tik.scope_cb)
                    with tik_instance.for_range(0, ko_tile_inner) as cc2:
                        tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0,
                                              no_tile, ko_tile_inner, 0, True)
                    # input_x1_L1 -> input_x1_L0A
                    input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0],
                                                       name="input_x1_L0A", scope=tik.scope_ca)
                    with tik_instance.for_range(0, mo_tile) as cc1:
                        tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0,
                                              ko_tile_inner, mo_tile, 0, False)
                    with tik_instance.if_scope(thread_idx_k == 0):
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 0)
                    with tik_instance.else_scope():
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1)
                res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0],
                                             name="resMatmul_ub", scope=tik.scope_ubuf)
                tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1)
                tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0],
                                       res_ub, 0, no_tile, mo_tile * c0 * c0 * fp16_size // blocksize, 0,
                                       (mo - mo_tile) * c0 * c0 * fp16_size // blocksize)
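The hunk above stages the fp32 right-hand operand through UB, casts it to fp16 with `vconv` (hence the 4/8 block strides, since fp16 packs twice as many elements per 32-byte block), and only then feeds the cube unit. A NumPy sketch of the numeric effect of that cast-before-multiply, on plain 2-D stand-ins with hypothetical shapes:

```python
import numpy as np

# Fractal-free stand-ins for one (mo_tile, ko_tile) x (ko_tile, no_tile) step.
x1 = np.random.rand(128, 64).astype(np.float16)   # left operand already fp16
x2_fp32 = np.random.rand(64, 32).astype(np.float32)

# fp32 -> fp16 cast performed by the vconv stage before the cube matmul.
x2_fp16 = x2_fp32.astype(np.float16)
out = x1 @ x2_fp16   # the cube unit multiplies fp16 inputs
print(out.dtype, out.shape)
```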
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
@@ -18,37 +18,35 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tik
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("matmulcubefraczrightmul.so") \
    .compute_cost(10) \
    .kernel_name("CusMatMulCubeFraczRightMul") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .input(2, "x3", False, "required", "all") \
    .input(3, "x4", False, "optional", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default,
                  DataType.F16_Default, DataType.F32_FracZ) \
    .get_op_info()
@op_info_register(cus_matmul_cube_fracz_right_mul_op_info)
def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False,
                               trans_b=False, kernel_name="matmulcube"):
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
@@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
    input_x3_shape = input_x3.get("shape")
    input_x3_dtype = input_x3.get("dtype").lower()
    output_shape = output_y.get("shape")
    Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"),
                 ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"),
                 ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
                 ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'),
                 ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'),
                 ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'),
@@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
                 ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
                 ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')]
    input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype,
                   tuple(input_x3_shape), input_x3_dtype)
    if input_shape not in Supported:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))
@@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
    return tik_instance
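Semantically, the kernel built above multiplies the two fractal operands and then scales every element of the product by the single fp32 scalar carried in `x3`. A NumPy sketch of that contract on plain 2-D stand-ins (fractal layout and tiling omitted; shapes hypothetical):

```python
import numpy as np

x1 = np.random.rand(128, 512).astype(np.float16)
x2 = np.random.rand(512, 512).astype(np.float16)
matrix_max = np.float32(0.25)  # the single fp32 scalar held in x3

# matmul on the cube unit, then a vmuls-style scalar multiply in fp32
res = (x1.astype(np.float32) @ x2.astype(np.float32)) * matrix_max
print(res.shape, res.dtype)
```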


def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res):
    diag_size = 128
@@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                                              name="resMatmul_L0C", scope=tik.scope_cc)
                with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
                    if diag_opt:
                        k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner
                    else:
                        k_idx = thread_idx_k * ko_tile_inner
                    # input_x1 -> input_x1_L1
@@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                    input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x2_L1,
                                           input_x2[(core_n * loop_n_num + cc_n) * no_tile, k_idx, 0, 0],
                                           0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize,
                                           (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0)
@@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1)
                res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
                                             name="resMatmul_ub", scope=tik.scope_ubuf)
                tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0)
                input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf,
                                                       name="input_3_local_UB")
                tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0)
                matrix_max_scalar = tik_instance.Scalar("float32")
@@ -236,7 +236,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                            res_ub[count * repeate_times_max * vectorfp32_size],
                            res_ub[count * repeate_times_max * vectorfp32_size],
                            matrix_max_scalar, repeate_num, 1, 1, 8, 8)
                tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile,
                                           (core_m * loop_m_num + cc_m) * mo_tile, 0, 0],
                                       res_ub, 0, no_tile,
mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py
@@ -18,13 +18,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tvm
from topi import generic
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
@@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
    .compute_cost(10) \
    .kernel_name("CusMatMulCube") \
    .partial_flag(True) \
    .attr("transpose_a", "required", "bool", "all") \
    .attr("transpose_b", "required", "bool", "all") \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .input(2, "x3", False, "optional", "all") \
@@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
    .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
@@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if m_shape != 1:
        if n_shape == 1:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
        elif km_shape % k_block_size != 0:
            raise RuntimeError("input shape K1 should be multiple of %d" % cce.BLOCK_IN)
    else:
        if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
            raise RuntimeError("input shape K1 should be multiple of %d"
                               % (cce.BLOCK_IN * cce.BLOCK_IN))
    if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
        raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
@@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if len(shape_bias):
        if len(shape_bias) == 1:
            if is_gevm or is_gemv:
                if shape_bias[0] != m_shape * n_shape:
                    raise RuntimeError("broadcast case shape bias for gemv must be equal m*n")
            else:
                if shape_bias[0] != n_shape:
@@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
        else:
            raise RuntimeError("unsupported input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return shape_bias
    else:
        bias_length = (bias_length // 16) * 16 + 16
        shape_bias = []
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
    res = []
    if dim_a % 16 != 0:
        dim_a = (dim_a // 16) * 16 + 16
        res.append(dim_a)
    else:
        res.append(dim_a)
    if dim_b % 16 != 0:
        dim_b = (dim_b // 16) * 16 + 16
        res.append(dim_b)
    else:
        res.append(dim_b)
    return res
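Both helpers round a dimension up to the next multiple of 16 (the cube block size) via `(x // 16) * 16 + 16`. A quick illustration:

```python
# _get_input_shape pads each dimension that is not already 16-aligned:
#   100 -> (100 // 16) * 16 + 16 = 112,   30 -> 32,   64 stays 64
print(_get_input_shape([100, 30]))  # [112, 32]
print(_get_input_shape([64, 64]))   # [64, 64]
```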
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
@@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
    try:
        trans_a_f = bool(1 - trans_a)
        if src_dtype == "float32" or src_dtype == "int32":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
@@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
        if trans_a_f and trans_b and shape_b[1] == 1:
            return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
@@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_op_info)
def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
@@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
"""
shape_a
=
input_x1
.
get
(
"ori_shape"
)
shape_b
=
input_x2
.
get
(
"ori_shape"
)
if
shape_a
is
not
None
:
if
len
(
shape_a
)
<
2
:
shape_a
=
input_x1
.
get
(
"shape"
)
if
shape_b
is
not
None
:
if
len
(
shape_b
)
<
2
:
shape_b
=
input_x2
.
get
(
"shape"
)
shape_a
=
list
(
shape_a
)
shape_b
=
list
(
shape_b
)
if
input_x1
.
get
(
"format"
)
==
"FRACTAL_NZ"
:
shape_a
=
_get_input_shape
(
shape_a
)
shape_b
=
_get_input_shape
(
shape_b
)
...
...
@@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = [shape_a[1], shape_a[0]]
        trans_a = bool(1 - trans_a)
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_b = [shape_b[1], shape_b[0]]
        trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    if src_dtype == "float32" or src_dtype == "int32":
@@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    if input_x1.get("format") == "FORMAT_FRACTAL_Z":
        shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
        format_a = "fractal"
@@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    else:
        shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1])
        format_a = "ND"
    if input_x2.get("format") == "FORMAT_FRACTAL_Z":
        shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
        format_b = "fractal"
@@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    else:
        shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1])
        format_b = "ND"
    tensor_bias = None
    tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)
    if len(shape_bias) > 0:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype)
    result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
                                format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
    with tvm.target.cce():
        schedule = generic.auto_schedule(result)
    tensor_list = [tensor_a, tensor_b, result]
    if len(shape_bias) > 0:
        tensor_list = [tensor_a, tensor_b, tensor_bias, result]
    config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)
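For float16 inputs the ND shape is refolded into fractal blocks of `block_in` by `block_reduce` (both 16 here), so an (m, k) = (32, 64) left operand with trans_a=False becomes `(32 // 16, 64 // 16, 16, 16) = (2, 4, 16, 16)`. The same arithmetic in a couple of lines:

```python
BLOCK_IN = BLOCK_REDUCE = 16  # cce block sizes for float16

m_shape, km_shape = 32, 64
shape_a_temp = (m_shape // BLOCK_IN, km_shape // BLOCK_REDUCE, BLOCK_IN, BLOCK_REDUCE)
print(shape_a_temp)  # (2, 4, 16, 16)
```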
mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusMatrixCombine"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("matrixcombine.so") \
    .compute_cost(10) \
    .kernel_name("CusMatrixCombine") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


@op_info_register(cus_matrix_combine_op_info)
def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
@@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
    blocks = 32
    matrix_dim = input_x_shape[0] * input_x_shape[1]
    if input_x_shape[0] == 1 and input_x_shape[1] == 64:
        tiling_dim = 2
        bs = 1
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0)
            tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0)
    else:
        tiling_dim = 4
        bs = input_x_shape[0]
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            zero = tik_instance.Scalar("float32")
            zero.set_as(0.0)
            with tik_instance.for_range(0, bs) as i:
@@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
                tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8)
                tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8)
                with tik_instance.for_range(0, tiling_dim) as j:
                    tik_instance.data_move(input_x_ub[j, split_dim * i],
                                           input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0)
                tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub,
                                       0, 1, tiling_dim * matrix_dim * 4 // 32, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
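Read as a whole, CusMatrixCombine zero-fills a large square matrix and copies each (split_dim x split_dim) input block onto its diagonal. A NumPy reference of that assembly, assuming that reading of the data moves above (names are illustrative):

```python
import numpy as np

def matrix_combine_reference(blocks_in):
    """blocks_in: (bs, d, d) blocks -> one block-diagonal (bs*d, bs*d) matrix."""
    bs, d, _ = blocks_in.shape
    out = np.zeros((bs * d, bs * d), dtype=blocks_in.dtype)
    for i in range(bs):
        out[i * d:(i + 1) * d, i * d:(i + 1) * d] = blocks_in[i]
    return out

blocks = np.random.rand(4, 128, 128).astype(np.float32)
combined = matrix_combine_reference(blocks)
print(combined.shape)  # (512, 512)
```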
mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
(diff collapsed, not shown)