From 19043e7b9772b5a2e7c093afce2473941400c137 Mon Sep 17 00:00:00 2001
From: wangzhuo325 <wangzhuo16@huawei.com>
Date: Mon, 6 Jul 2020 15:56:30 +0800
Subject: [PATCH] support adding bias in matmul

---
 python/akg/ops/nn/matmul.py             | 10 ++--
 src/poly/cce_isl_emitter.cc             | 13 ++++-
 tests/common/test_run/matmul_run.py     | 42 ++++++-------
 tests/operators/cube/test_matmul_001.py | 78 ++++++++++++-------------
 4 files changed, 79 insertions(+), 64 deletions(-)
diff --git a/python/akg/ops/nn/matmul.py b/python/akg/ops/nn/matmul.py
index 31a2bab..7094ce7 100644
--- a/python/akg/ops/nn/matmul.py
+++ b/python/akg/ops/nn/matmul.py
@@ -271,7 +271,7 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out
         "bias": bias_name,
     })
 
-    if out_dtype == "float16":
+    if out_dtype == "float16" and (bias_value is None or bias_value.dtype == "float16"):  # fp32 bias: cast after the bias add
         result_matmul = cast.cast(result_matmul, out_dtype)
 
     def matmul_reshape(shape, result_matmul, *indices):
@@ -288,10 +288,10 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out
         N = len(output_shape)
         # reduce axis
         if output_format == "zN":
-            bias_indices = indices[:(N - 4)] + indices[(N - 4):(N - 3)] + (0, 0) + indices[(N - 1):]
+            bias_indices = indices[N - 4] * cce.BLOCK_OUT + indices[N - 1]
         elif output_format == "zZ":
-            bias_indices = indices[:(N - 4)] + (0,) + indices[(N - 3):(N - 2)] + (0,) + indices[(N - 1):]
-        return result(*indices) + bias(*bias_indices)
+            bias_indices = indices[N - 3] * cce.BLOCK_OUT + indices[N - 1]
+        return result(*indices) + bias(bias_indices)
     if bias == 1:
         if out_format == "zN":
             out = akg.tvm.compute(output_shape_zN, lambda *indices: bias_compute(output_shape_zN, result, bias_value, out_format, *indices),
@@ -299,6 +299,8 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out
         elif out_format == "zZ":
             out = akg.tvm.compute(output_shape_zZ, lambda *indices: bias_compute(output_shape_zZ, result, bias_value, out_format, *indices),
                               name="output")
+        if out_dtype == "float16" and bias_value.dtype == "float32":
+            out = cast.cast(out, out_dtype)
     else:
         out = result
 
diff --git a/src/poly/cce_isl_emitter.cc b/src/poly/cce_isl_emitter.cc
index d15af81..acf770f 100644
--- a/src/poly/cce_isl_emitter.cc
+++ b/src/poly/cce_isl_emitter.cc
@@ -171,10 +171,21 @@ class HoistL0Write : public IRMutator {
       for (const auto &arg : op->args) {
         args.push_back(this->Mutate(arg));
       }
-      return Provide::make(op->func, op->value_index, op->value, args);
+      auto value = this->Mutate(op->value);
+      return Provide::make(op->func, op->value_index, value, args);
     }
     return IRMutator::Mutate_(op, s);
   }
+  Expr Mutate_(const Call *op, const Expr &e) final {  // re-mutate call args only while mutate_write_ is set
+    if (!mutate_write_) {
+      return IRMutator::Mutate_(op, e);  // otherwise defer to the base mutator
+    }
+    Array<Expr> mutated_args;
+    for (const auto &call_arg : op->args) {
+      mutated_args.push_back(this->Mutate(call_arg));
+    }
+    return Call::make(op->type, op->name, mutated_args, op->call_type, op->func, op->value_index);
+  }
 
   bool found_{false};
   bool mutate_{false};
diff --git a/tests/common/test_run/matmul_run.py b/tests/common/test_run/matmul_run.py
index 3e88148..bf2a1eb 100644
--- a/tests/common/test_run/matmul_run.py
+++ b/tests/common/test_run/matmul_run.py
@@ -121,7 +121,7 @@ def np_matmul(matrix_a, matrix_b, batch_tuple, M, K, N, trans_data=False, trans_
 
 
 def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False,
-            dtype="float16", out_dtype="float16", bias=0, left_format="zZ", right_format="nZ", output_format="zN"):
+            dtype="float16", bias_dtype="float16", out_dtype="float16", bias=0, left_format="zZ", right_format="nZ", output_format="zN"):
     shape_x, shape_y = get_shapes(batch_tuple, M, K, N, trans_data, trans_weight)
     matrix_a = random_gaussian(shape_x, miu=0.1, sigma=0.01).astype(dtype)
     matrix_b = random_gaussian(shape_y, miu=0.1, sigma=0.01).astype(dtype)
@@ -137,13 +137,19 @@ def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False,
     if dtype == "float16":
         out.astype(np.float16)
 
-    bias_shape = batch_tuple + (N // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT)
-    if output_format == "zZ":
-        bias_shape = batch_tuple + (1, N // cce.BLOCK_OUT, 1, cce.BLOCK_OUT)
-    bias_data = np.full(bias_shape, np.nan, out_dtype)
-    if bias == 1:
-        bias_data = random_gaussian(bias_shape, miu=0.5, sigma=0.01).astype(out_dtype)
-        out = out + bias_data
+    bias_shape = (N,)
+    bias_data = np.full(bias_shape, np.nan, bias_dtype)
+    if bias != 0:
+        bias_data = random_gaussian(bias_shape, miu=0.5, sigma=0.01).astype(bias_dtype)
+        bias_reshape = (N // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT)
+        if output_format == "zZ":
+            bias_reshape = (1, N // cce.BLOCK_OUT, 1, cce.BLOCK_OUT)
+        bias_data_reshaped = bias_data.reshape(bias_reshape)
+        if bias_dtype != out_dtype:
+            out = out.astype(np.float32) + bias_data_reshaped.astype(np.float32)
+            out = out.astype(out_dtype)
+        else:
+            out = out + bias_data_reshaped
 
     shape_x = ()
     shape_y = ()
@@ -185,14 +191,14 @@ def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False,
     return fractal_a, fractal_b, out, bias_data
 
 
-def matmul_data(batch_tuple, M, K, N, dtype, out_dtype, bias, adj_x, adj_y, left_format=None, right_format=None, output_format=None, debug_logging=False):
+def matmul_data(batch_tuple, M, K, N, dtype, bias_dtype, out_dtype, bias, adj_x, adj_y, left_format=None, right_format=None, output_format=None, debug_logging=False):
     m_x = ()
     m_y = ()
     bench_mark = ()
     bias_data = ()
     logging.debug("gen data start!")
     a = datetime.now()
-    m_x, m_y, bench_mark, bias_data = genData(batch_tuple, M, K, N, adj_x, adj_y, dtype, out_dtype, bias, left_format, right_format, output_format)
+    m_x, m_y, bench_mark, bias_data = genData(batch_tuple, M, K, N, adj_x, adj_y, dtype, bias_dtype, out_dtype, bias, left_format, right_format, output_format)
     b = datetime.now()
     logging.debug((b - a).seconds)
     logging.debug("gen data end!")
@@ -295,17 +301,13 @@ def get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format="
         output_shape = batch_tuple + (m // cce.BLOCK_OUT, 1, n % cce.BLOCK_IN, cce.BLOCK_OUT)
 
     if bias == 1:
-        if out_format == "zN":
-            bias_shape_nc1hwc0 = batch_tuple + (n // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT)
-        elif out_format == "zZ":
-            bias_shape_nc1hwc0 = batch_tuple + (1, n // cce.BLOCK_OUT, 1, cce.BLOCK_OUT)
-
+        bias_shape_nc1hwc0 = (n,)
     else:
         bias_shape_nc1hwc0 = None 
     return shape_xx, shape_yy, bias_shape_nc1hwc0, output_shape, k
 
 
-def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs):
+def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs):
     '''
     There are four types of fractal format in Davinci core: zZ, zN, nZ, nN
     general matmul format
@@ -323,9 +325,9 @@ def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format
     n = (n + 15) // 16 * 16
     k = (k + 15) // 16 * 16
     shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, out_format)
-    mod = matmul_compile(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs)
+    mod = matmul_compile(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs)
     # Generate data
-    m_x, m_y, bench_mark, bias_data = matmul_data(batch_tuple, m, k, n, dtype, out_dtype, bias, adj_x, adj_y, left_format, right_format, out_format)
+    m_x, m_y, bench_mark, bias_data = matmul_data(batch_tuple, m, k, n, dtype, bias_dtype, out_dtype, bias, adj_x, adj_y, left_format, right_format, out_format)
 
     # mod launch
     output = np.full(out_shape, np.nan, out_dtype)
@@ -341,7 +343,7 @@ def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format
     return (m_x, m_y), output, bench_mark, compare_result
 
 
-def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs, tuning=False):
+def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs, tuning=False):
     batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y)
     m = (m + 15) // 16 * 16
     n = (n + 15) // 16 * 16
@@ -349,7 +351,7 @@ def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_for
     shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias,
                                                                         left_format, right_format, output_format)
     input_shapes = [shape_xx, shape_yy, bias_shape]
-    input_types = [dtype, dtype, out_dtype]
+    input_types = [dtype, dtype, bias_dtype]
     has_bias = False
     if bias == 1:
         has_bias = True
diff --git a/tests/operators/cube/test_matmul_001.py b/tests/operators/cube/test_matmul_001.py
index 9b2b644..d728d9d 100644
--- a/tests/operators/cube/test_matmul_001.py
+++ b/tests/operators/cube/test_matmul_001.py
@@ -31,85 +31,85 @@ class TestCase(TestBase):
         self._log.info("============= {0} Setup case============".format(self.casename))
         self.testarg = [
             # caseflag,opfuncname,testRunArgs, dimArgs
-            # shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs
+            # shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs
 
             # bert shape
-            ("matmul_run_bert_00", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_01", "matmul_run", ((8192, 4096), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_02", "matmul_run", ((8192, 1024), (1024, 4096), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_03", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_04", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_05", "matmul_run", ((8192, 4096), (4096, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_06", "matmul_run", ((8192, 1024), (4096, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_07", "matmul_run", ((8192, 1024), (8192, 4096), 0, "zN", "zN", "zN", True, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_08", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_09", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_10", "matmul_run", ((1216, 30522), (30522, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_11", "matmul_run", ((1216, 30522), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_12", "matmul_run", ((1216, 1024), (30522, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_13", "matmul_run", ((8192, 1024), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_14", "matmul_run", ((1216, 1024), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_15", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_16", "matmul_run", ((16, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_17", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")),
-            ("matmul_run_bert_18", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")),
-            ("matmul_run_bert_19", "matmul_run", ((8192, 4096), (1024, 4096), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")),
+            ("matmul_run_bert_00", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_01", "matmul_run", ((8192, 4096), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_02", "matmul_run", ((8192, 1024), (1024, 4096), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_03", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_04", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_05", "matmul_run", ((8192, 4096), (4096, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_06", "matmul_run", ((8192, 1024), (4096, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_07", "matmul_run", ((8192, 1024), (8192, 4096), 0, "zN", "zN", "zN", True, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_08", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_09", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_10", "matmul_run", ((1216, 30522), (30522, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_11", "matmul_run", ((1216, 30522), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_12", "matmul_run", ((1216, 1024), (30522, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_13", "matmul_run", ((8192, 1024), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_14", "matmul_run", ((1216, 1024), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_15", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_16", "matmul_run", ((16, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_17", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")),
+            ("matmul_run_bert_18", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")),
+            ("matmul_run_bert_19", "matmul_run", ((8192, 4096), (1024, 4096), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")),
 
             # matmul_cast
             ("matmul_run1", "matmul_run",
-             ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float32", "matmul_cast_cce")),
+             ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float32", "matmul_cast_cce")),
             # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))),
             # matmul_bias
             ("matmul_run2", "matmul_run",
-             ((64, 1024), (16, 1024), 1, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_bias_cce")),
+             ((64, 1024), (16, 1024), 1, "zZ", "nZ", "zN", False, True, "float16", "float16", "float16", "matmul_bias_cce")),
             # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))),
             # matmul_trans
             ("matmul_run3", "matmul_run",
-             ((1024, 64), (16, 1024), 1, "zZ", "nZ", "zN", True, True, "float16", "float16", "matmul_bias_cce")),
+             ((1024, 64), (16, 1024), 1, "zZ", "nZ", "zN", True, True, "float16", "float16", "float16", "matmul_bias_cce")),
             # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))),
 
             # matmul
             ("matmul_run4", "matmul_run",
-             ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")),
+             ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")),
             # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))),
             ("matmul_run5", "matmul_run",
-             ((1024, 16), (16, 1024), 1, "zZ", "nZ", "zN", False, False, "float16", "float16", "matmul_cce")),
+             ((1024, 16), (16, 1024), 1, "zZ", "nZ", "zN", False, False, "float16", "float16", "float16", "matmul_cce")),
             # ((8, 8), (8, 8), (128, 128), (128, 128), (16, 16))),
             ("matmul_run9", "matmul_run",
-             ((16, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")),
+             ((16, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")),
             # ((16, 16), (16, 16), (16, 16))),
             ("matmul_run16", "matmul_run",
-             ((16, 64), (64, 1024), 0, "zZ", "nZ", "zN", False, False, "float16", "float16", "matmul_cce")),
+             ((16, 64), (64, 1024), 0, "zZ", "nZ", "zN", False, False, "float16", None, "float16", "matmul_cce")),
             # ((16, 16), (16, 16), (16, 16), (4, 4))),
 
             # new shape for bert
             # ("matmul_run29", "matmul_run",
-            # ((8192,2), (1024,2), 0, 0, False, True,  "float16", "float16", "matmul_cce"),
+            # ((8192,2), (1024,2), 0, 0, False, True,  "float16", None, "float16", "matmul_cce"),
             # ((8, 8), (8, 8), (128, 128), (128, 128), (16, 16))),
 
             ("matmul_run30", "matmul_run",
-             ((64, 1024), (2, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")),
+             ((64, 1024), (2, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")),
             # ((4, 4), (16, 16), (16, 16), (16, 16), (16, 16))),
 
             ("matmul_run31", "matmul_run",
-             ((2, 64), (1024, 64), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")),
+             ((2, 64), (1024, 64), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")),
             # ((16, 16), (16, 16), (16, 16), (16, 16))),
 
             # zZ case
             ("matmul_run1", "matmul_run",
-             ((6272, 256), (6272, 256), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cast_cce")),
+             ((6272, 256), (6272, 256), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cast_cce")),
             ("matmul_run2", "matmul_run",
-             ((6272*16, 4*16), (6272*16, 4*16), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cce")),
+             ((6272*16, 4*16), (6272*16, 4*16), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cce")),
             ("matmul_run3", "matmul_run",
-             ((1568*16, 8*16), (1568*16, 8*16), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cce")),
+             ((1568*16, 8*16), (1568*16, 8*16), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cce")),
 
             # zN case
             ("matmul_run_zN_1", "matmul_run",
-             ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")),
+             ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")),
             ("matmul_run_zN_2", "matmul_run",
-             ((32, 48), (48, 64), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")),
+             ((32, 48), (48, 64), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")),
             ("matmul_run_zN_3", "matmul_run",
-             ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")),
+             ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")),
         ]
 
         self.testarg_rpc_cloud = [
@@ -121,11 +121,11 @@ class TestCase(TestBase):
             #shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs
 
             ("matmul_run29", "matmul_run",
-             ((8192, 16), (1024, 16), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce"),
+             ((8192, 16), (1024, 16), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce"),
              ((8, 8), (8, 8), (128, 128), (128, 128), (128, 128))),
 
             # ("matmul_run33", "matmul_run",
-            #  ((16, 32), (32, 32), 0, 0, False, True, "float16", "float16", "matmul_cce"),
+            #  ((16, 32), (32, 32), 0, 0, False, True, "float16", None, "float16", "matmul_cce"),
             #  ((4, 8), (4,8), (16, 128), (16, 128), (16, 128))),
         ]
 
-- 
GitLab