From 19043e7b9772b5a2e7c093afce2473941400c137 Mon Sep 17 00:00:00 2001 From: wangzhuo325 Date: Mon, 6 Jul 2020 15:56:30 +0800 Subject: [PATCH] support adding bias in matmul --- python/akg/ops/nn/matmul.py | 10 ++-- src/poly/cce_isl_emitter.cc | 13 ++++- tests/common/test_run/matmul_run.py | 42 ++++++------- tests/operators/cube/test_matmul_001.py | 78 ++++++++++++------------- 4 files changed, 79 insertions(+), 64 deletions(-) diff --git a/python/akg/ops/nn/matmul.py b/python/akg/ops/nn/matmul.py index 31a2bab..7094ce7 100644 --- a/python/akg/ops/nn/matmul.py +++ b/python/akg/ops/nn/matmul.py @@ -271,7 +271,7 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out "bias": bias_name, }) - if out_dtype == "float16": + if out_dtype == "float16" and (bias_value == None or bias_value.dtype == "float16"): result_matmul = cast.cast(result_matmul, out_dtype) def matmul_reshape(shape, result_matmul, *indices): @@ -288,10 +288,10 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out N = len(output_shape) # reduce axis if output_format == "zN": - bias_indices = indices[:(N - 4)] + indices[(N - 4):(N - 3)] + (0, 0) + indices[(N - 1):] + bias_indices = indices[N - 4] * cce.BLOCK_OUT + indices[N - 1] elif output_format == "zZ": - bias_indices = indices[:(N - 4)] + (0,) + indices[(N - 3):(N - 2)] + (0,) + indices[(N - 1):] - return result(*indices) + bias(*bias_indices) + bias_indices = indices[N - 3] * cce.BLOCK_OUT + indices[N - 1] + return result(*indices) + bias(bias_indices) if bias == 1: if out_format == "zN": out = akg.tvm.compute(output_shape_zN, lambda *indices: bias_compute(output_shape_zN, result, bias_value, out_format, *indices), @@ -299,6 +299,8 @@ def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out elif out_format == "zZ": out = akg.tvm.compute(output_shape_zZ, lambda *indices: bias_compute(output_shape_zZ, result, bias_value, out_format, *indices), name="output") + if out_dtype == "float16" and bias_value.dtype == "float32": + out = cast.cast(out, out_dtype) else: out = result diff --git a/src/poly/cce_isl_emitter.cc b/src/poly/cce_isl_emitter.cc index d15af81..acf770f 100644 --- a/src/poly/cce_isl_emitter.cc +++ b/src/poly/cce_isl_emitter.cc @@ -171,10 +171,21 @@ class HoistL0Write : public IRMutator { for (const auto &arg : op->args) { args.push_back(this->Mutate(arg)); } - return Provide::make(op->func, op->value_index, op->value, args); + auto value = this->Mutate(op->value); + return Provide::make(op->func, op->value_index, value, args); } return IRMutator::Mutate_(op, s); } + Expr Mutate_(const Call *op, const Expr &e) final { + if (mutate_write_) { + Array args; + for (const auto &arg : op->args) { + args.push_back(this->Mutate(arg)); + } + return Call::make(op->type, op->name, args, op->call_type, op->func, op->value_index); + } + return IRMutator::Mutate_(op, e); + } bool found_{false}; bool mutate_{false}; diff --git a/tests/common/test_run/matmul_run.py b/tests/common/test_run/matmul_run.py index 3e88148..bf2a1eb 100644 --- a/tests/common/test_run/matmul_run.py +++ b/tests/common/test_run/matmul_run.py @@ -121,7 +121,7 @@ def np_matmul(matrix_a, matrix_b, batch_tuple, M, K, N, trans_data=False, trans_ def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False, - dtype="float16", out_dtype="float16", bias=0, left_format="zZ", right_format="nZ", output_format="zN"): + dtype="float16", bias_dtype="float16", out_dtype="float16", bias=0, left_format="zZ", right_format="nZ", output_format="zN"): shape_x, shape_y = get_shapes(batch_tuple, M, K, N, trans_data, trans_weight) matrix_a = random_gaussian(shape_x, miu=0.1, sigma=0.01).astype(dtype) matrix_b = random_gaussian(shape_y, miu=0.1, sigma=0.01).astype(dtype) @@ -137,13 +137,19 @@ def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False, if dtype == "float16": out.astype(np.float16) - bias_shape = batch_tuple + (N // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT) - if output_format == "zZ": - bias_shape = batch_tuple + (1, N // cce.BLOCK_OUT, 1, cce.BLOCK_OUT) - bias_data = np.full(bias_shape, np.nan, out_dtype) - if bias == 1: - bias_data = random_gaussian(bias_shape, miu=0.5, sigma=0.01).astype(out_dtype) - out = out + bias_data + bias_shape = (N,) + bias_data = np.full(bias_shape, np.nan, bias_dtype) + if bias != 0: + bias_data = random_gaussian(bias_shape, miu=0.5, sigma=0.01).astype(bias_dtype) + bias_reshape = (N // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT) + if output_format == "zZ": + bias_reshape = (1, N // cce.BLOCK_OUT, 1, cce.BLOCK_OUT) + bias_data_reshaped = bias_data.reshape(bias_reshape) + if bias_dtype != out_dtype: + out = out.astype(np.float32) + bias_data_reshaped.astype(np.float32) + out = out.astype(out_dtype) + else: + out = out + bias_data_reshaped shape_x = () shape_y = () @@ -185,14 +191,14 @@ def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False, return fractal_a, fractal_b, out, bias_data -def matmul_data(batch_tuple, M, K, N, dtype, out_dtype, bias, adj_x, adj_y, left_format=None, right_format=None, output_format=None, debug_logging=False): +def matmul_data(batch_tuple, M, K, N, dtype, bias_dtype, out_dtype, bias, adj_x, adj_y, left_format=None, right_format=None, output_format=None, debug_logging=False): m_x = () m_y = () bench_mark = () bias_data = () logging.debug("gen data start!") a = datetime.now() - m_x, m_y, bench_mark, bias_data = genData(batch_tuple, M, K, N, adj_x, adj_y, dtype, out_dtype, bias, left_format, right_format, output_format) + m_x, m_y, bench_mark, bias_data = genData(batch_tuple, M, K, N, adj_x, adj_y, dtype, bias_dtype, out_dtype, bias, left_format, right_format, output_format) b = datetime.now() logging.debug((b - a).seconds) logging.debug("gen data end!") @@ -295,17 +301,13 @@ def get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format=" output_shape = batch_tuple + (m // cce.BLOCK_OUT, 1, n % cce.BLOCK_IN, cce.BLOCK_OUT) if bias == 1: - if out_format == "zN": - bias_shape_nc1hwc0 = batch_tuple + (n // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT) - elif out_format == "zZ": - bias_shape_nc1hwc0 = batch_tuple + (1, n // cce.BLOCK_OUT, 1, cce.BLOCK_OUT) - + bias_shape_nc1hwc0 = (n,) else: bias_shape_nc1hwc0 = None return shape_xx, shape_yy, bias_shape_nc1hwc0, output_shape, k -def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs): +def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs): ''' There are four types of fractal format in Davinci core: zZ, zN, nZ, nN general matmul format @@ -323,9 +325,9 @@ def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format n = (n + 15) // 16 * 16 k = (k + 15) // 16 * 16 shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, out_format) - mod = matmul_compile(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs) + mod = matmul_compile(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs) # Generate data - m_x, m_y, bench_mark, bias_data = matmul_data(batch_tuple, m, k, n, dtype, out_dtype, bias, adj_x, adj_y, left_format, right_format, out_format) + m_x, m_y, bench_mark, bias_data = matmul_data(batch_tuple, m, k, n, dtype, bias_dtype, out_dtype, bias, adj_x, adj_y, left_format, right_format, out_format) # mod launch output = np.full(out_shape, np.nan, out_dtype) @@ -341,7 +343,7 @@ def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format return (m_x, m_y), output, bench_mark, compare_result -def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs, tuning=False): +def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs, tuning=False): batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y) m = (m + 15) // 16 * 16 n = (n + 15) // 16 * 16 @@ -349,7 +351,7 @@ def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_for shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, output_format) input_shapes = [shape_xx, shape_yy, bias_shape] - input_types = [dtype, dtype, out_dtype] + input_types = [dtype, dtype, bias_dtype] has_bias = False if bias == 1: has_bias = True diff --git a/tests/operators/cube/test_matmul_001.py b/tests/operators/cube/test_matmul_001.py index 9b2b644..d728d9d 100644 --- a/tests/operators/cube/test_matmul_001.py +++ b/tests/operators/cube/test_matmul_001.py @@ -31,85 +31,85 @@ class TestCase(TestBase): self._log.info("============= {0} Setup case============".format(self.casename)) self.testarg = [ # caseflag,opfuncname,testRunArgs, dimArgs - # shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs + # shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, bias_dtype, out_dtype, kernel_name, attrs # bert shape - ("matmul_run_bert_00", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_01", "matmul_run", ((8192, 4096), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_02", "matmul_run", ((8192, 1024), (1024, 4096), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_03", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_04", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_05", "matmul_run", ((8192, 4096), (4096, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_06", "matmul_run", ((8192, 1024), (4096, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_07", "matmul_run", ((8192, 1024), (8192, 4096), 0, "zN", "zN", "zN", True, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_08", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_09", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_10", "matmul_run", ((1216, 30522), (30522, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_11", "matmul_run", ((1216, 30522), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_12", "matmul_run", ((1216, 1024), (30522, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_13", "matmul_run", ((8192, 1024), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_14", "matmul_run", ((1216, 1024), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_15", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_16", "matmul_run", ((16, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_17", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")), - ("matmul_run_bert_18", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")), - ("matmul_run_bert_19", "matmul_run", ((8192, 4096), (1024, 4096), 0, "zN", "zN", "zN", False, True, "float16", "float16", "matmul_cce")), + ("matmul_run_bert_00", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_01", "matmul_run", ((8192, 4096), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_02", "matmul_run", ((8192, 1024), (1024, 4096), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_03", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_04", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_05", "matmul_run", ((8192, 4096), (4096, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_06", "matmul_run", ((8192, 1024), (4096, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_07", "matmul_run", ((8192, 1024), (8192, 4096), 0, "zN", "zN", "zN", True, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_08", "matmul_run", ((1216, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_09", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_10", "matmul_run", ((1216, 30522), (30522, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_11", "matmul_run", ((1216, 30522), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_12", "matmul_run", ((1216, 1024), (30522, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_13", "matmul_run", ((8192, 1024), (8192, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_14", "matmul_run", ((1216, 1024), (1216, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_15", "matmul_run", ((16, 1024), (16, 1024), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_16", "matmul_run", ((16, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_17", "matmul_run", ((16, 16), (16, 1024), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")), + ("matmul_run_bert_18", "matmul_run", ((8192, 1024), (1024, 1024), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")), + ("matmul_run_bert_19", "matmul_run", ((8192, 4096), (1024, 4096), 0, "zN", "zN", "zN", False, True, "float16", None, "float16", "matmul_cce")), # matmul_cast ("matmul_run1", "matmul_run", - ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float32", "matmul_cast_cce")), + ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float32", "matmul_cast_cce")), # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))), # matmul_bias ("matmul_run2", "matmul_run", - ((64, 1024), (16, 1024), 1, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_bias_cce")), + ((64, 1024), (16, 1024), 1, "zZ", "nZ", "zN", False, True, "float16", "float16", "float16", "matmul_bias_cce")), # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))), # matmul_trans ("matmul_run3", "matmul_run", - ((1024, 64), (16, 1024), 1, "zZ", "nZ", "zN", True, True, "float16", "float16", "matmul_bias_cce")), + ((1024, 64), (16, 1024), 1, "zZ", "nZ", "zN", True, True, "float16", "float16", "float16", "matmul_bias_cce")), # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))), # matmul ("matmul_run4", "matmul_run", - ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")), + ((64, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")), # ((4, 4), (16, 16), (128, 128), (16, 16), (16, 16))), ("matmul_run5", "matmul_run", - ((1024, 16), (16, 1024), 1, "zZ", "nZ", "zN", False, False, "float16", "float16", "matmul_cce")), + ((1024, 16), (16, 1024), 1, "zZ", "nZ", "zN", False, False, "float16", "float16", "float16", "matmul_cce")), # ((8, 8), (8, 8), (128, 128), (128, 128), (16, 16))), ("matmul_run9", "matmul_run", - ((16, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")), + ((16, 1024), (16, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")), # ((16, 16), (16, 16), (16, 16))), ("matmul_run16", "matmul_run", - ((16, 64), (64, 1024), 0, "zZ", "nZ", "zN", False, False, "float16", "float16", "matmul_cce")), + ((16, 64), (64, 1024), 0, "zZ", "nZ", "zN", False, False, "float16", None, "float16", "matmul_cce")), # ((16, 16), (16, 16), (16, 16), (4, 4))), # new shape for bert # ("matmul_run29", "matmul_run", - # ((8192,2), (1024,2), 0, 0, False, True, "float16", "float16", "matmul_cce"), + # ((8192,2), (1024,2), 0, 0, False, True, "float16", None, "float16", "matmul_cce"), # ((8, 8), (8, 8), (128, 128), (128, 128), (16, 16))), ("matmul_run30", "matmul_run", - ((64, 1024), (2, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")), + ((64, 1024), (2, 1024), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")), # ((4, 4), (16, 16), (16, 16), (16, 16), (16, 16))), ("matmul_run31", "matmul_run", - ((2, 64), (1024, 64), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce")), + ((2, 64), (1024, 64), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce")), # ((16, 16), (16, 16), (16, 16), (16, 16))), # zZ case ("matmul_run1", "matmul_run", - ((6272, 256), (6272, 256), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cast_cce")), + ((6272, 256), (6272, 256), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cast_cce")), ("matmul_run2", "matmul_run", - ((6272*16, 4*16), (6272*16, 4*16), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cce")), + ((6272*16, 4*16), (6272*16, 4*16), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cce")), ("matmul_run3", "matmul_run", - ((1568*16, 8*16), (1568*16, 8*16), 0, "zZ", "zZ", "zZ", True, False, "float16", "float32", "matmul_cce")), + ((1568*16, 8*16), (1568*16, 8*16), 0, "zZ", "zZ", "zZ", True, False, "float16", None, "float32", "matmul_cce")), # zN case ("matmul_run_zN_1", "matmul_run", - ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, False, "float16", "float32", "matmul_cce")), + ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, False, "float16", None, "float32", "matmul_cce")), ("matmul_run_zN_2", "matmul_run", - ((32, 48), (48, 64), 0, "zN", "zN", "zN", True, False, "float16", "float32", "matmul_cce")), + ((32, 48), (48, 64), 0, "zN", "zN", "zN", True, False, "float16", None, "float32", "matmul_cce")), ("matmul_run_zN_3", "matmul_run", - ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, True, "float16", "float32", "matmul_cce")), + ((32, 48), (48, 64), 0, "zN", "zN", "zN", False, True, "float16", None, "float32", "matmul_cce")), ] self.testarg_rpc_cloud = [ @@ -121,11 +121,11 @@ class TestCase(TestBase): #shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs ("matmul_run29", "matmul_run", - ((8192, 16), (1024, 16), 0, "zZ", "nZ", "zN", False, True, "float16", "float16", "matmul_cce"), + ((8192, 16), (1024, 16), 0, "zZ", "nZ", "zN", False, True, "float16", None, "float16", "matmul_cce"), ((8, 8), (8, 8), (128, 128), (128, 128), (128, 128))), # ("matmul_run33", "matmul_run", - # ((16, 32), (32, 32), 0, 0, False, True, "float16", "float16", "matmul_cce"), + # ((16, 32), (32, 32), 0, 0, False, True, "float16", None, "float16", "matmul_cce"), # ((4, 8), (4,8), (16, 128), (16, 128), (16, 128))), ] -- GitLab