# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime
import logging
from enum import Enum

from gen_random import random_gaussian
import numpy as np

import akg.backend as cce
from akg.utils import kernel_exec as utils
from test_op import matmul
from base import get_rtol_atol
from tensorio import compare_tensor

logging.basicConfig(level=logging.DEBUG)


class MatmulType(Enum):
    gemm = 1
    gevm = 2
    gemv = 3


def get_name(caseIndex=1, name="leftMatrix", M=0, K=0, N=0, adj_x=False, adj_y=False):
    res = "{}_{}_{}_{}_{}_{}_{}.bin".format(caseIndex, name, M, K, N, adj_x, adj_y)
    return res


def get_shape(name="leftMatrix", M=0, K=0, N=0, batch_tuple=(1,), adj_x=False, adj_y=False):
    res_shape = ()
    if name == "leftMatrix":
        if adj_x:
            res_shape = batch_tuple + (K // cce.BLOCK_REDUCE, M // cce.BLOCK_IN, cce.BLOCK_REDUCE, cce.BLOCK_IN)
        else:
            res_shape = batch_tuple + (M // cce.BLOCK_IN, K // cce.BLOCK_REDUCE, cce.BLOCK_IN, cce.BLOCK_REDUCE)
    if name == "rightMatrix":
        if adj_y:
            res_shape = batch_tuple + (N // cce.BLOCK_OUT, K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE, cce.BLOCK_OUT)
        else:
            res_shape = batch_tuple + (K // cce.BLOCK_REDUCE, N // cce.BLOCK_OUT, cce.BLOCK_OUT, cce.BLOCK_REDUCE)
    if name == "result":
        res_shape = batch_tuple + (N // cce.BLOCK_OUT, M // cce.BLOCK_IN, cce.BLOCK_IN, cce.BLOCK_OUT)
    return res_shape


def get_shapes(batch_tuple, M, K, N, trans_data=False, trans_weight=False):
    shape_x = batch_tuple + (M, K)
    if trans_data:
        shape_x = batch_tuple + (K, M)
    shape_y = batch_tuple + (K, N)
    if trans_weight:
        shape_y = batch_tuple + (N, K)
    return shape_x, shape_y


def getMatmulType(m, n, k):
    # gemm: general matrix-matrix multiply
    # gevm: m is smaller than one BLOCK_IN tile (row-vector times matrix)
    # gemv: n == 1 (matrix times column-vector)
    matmul_type = MatmulType.gemm
    if m // cce.BLOCK_IN == 0:
        matmul_type = MatmulType.gevm
    elif n == 1:
        matmul_type = MatmulType.gemv
    return matmul_type


def np_matmul(matrix_a, matrix_b, batch_tuple, M, K, N, trans_data=False, trans_weight=False, output_format=None):
    """
    numpy implementation of the batch matmul, used as the benchmark

    :param matrix_a: (batch1, batch2, ..., M, K)
    :param matrix_b: (batch1, batch2, ..., K, N)
    :return: (batch1, batch2, ..., M, N)
    """
    batch_len = len(batch_tuple)
    if trans_data:
        matrix_a = matrix_a.transpose(tuple(range(batch_len)) + (batch_len + 1, batch_len))
    if trans_weight:
        matrix_b = matrix_b.transpose(tuple(range(batch_len)) + (batch_len + 1, batch_len))
    mul = 1
    for i in batch_tuple:
        mul = mul * i
    reshape_x = matrix_a.reshape(mul, M, K)
    reshape_y = matrix_b.reshape(mul, K, N)
    flatten_shape = (mul, M, N)
    out = np.zeros(flatten_shape, dtype=np.float16)
    for b in range(mul):
        out[b, :] = np.dot(reshape_x[b, :], reshape_y[b, :])
        # out[b, :] = np.matmul(reshape_x[b, :], reshape_y[b, :])
    matmul_type = getMatmulType(M, N, K)
    out_shape = ()
    if matmul_type == MatmulType.gemm:
        out_shape = batch_tuple + (M // cce.BLOCK_IN, cce.BLOCK_IN, N // cce.BLOCK_OUT, cce.BLOCK_OUT)
    elif matmul_type == MatmulType.gevm:
        out_shape = batch_tuple + (1, M % cce.BLOCK_IN, N // cce.BLOCK_OUT, cce.BLOCK_OUT)
    elif matmul_type == MatmulType.gemv:
        out_shape = batch_tuple + (M // cce.BLOCK_IN, cce.BLOCK_IN, 1, N % cce.BLOCK_OUT)
    logging.debug(out_shape)
    # default output is zN: No, Mo, Mi, Ni (N outer, M outer, M inner, N inner)
    trans = tuple(range(batch_len)) + (batch_len + 2, batch_len, batch_len + 1, batch_len + 3)
    if output_format == "zZ":
        trans = tuple(range(batch_len)) + (batch_len, batch_len + 2, batch_len + 1, batch_len + 3)
    if matmul_type == MatmulType.gemv:
        # use the transpose of out
        trans = tuple(range(batch_len)) + (batch_len, batch_len + 2, batch_len + 3, batch_len + 1)
    res = out.reshape(out_shape).transpose(trans).copy()
    return res


def genData(batch_tuple, M, K, N, trans_data=False, trans_weight=False, dtype="float16", out_dtype="float16",
            bias=0, left_format="zZ", right_format="nZ", output_format="zN"):
    """Generate the fractal-tiled input matrices, the numpy benchmark result and the bias data."""
    shape_x, shape_y = get_shapes(batch_tuple, M, K, N, trans_data, trans_weight)
    matrix_a = random_gaussian(shape_x, miu=0.1, sigma=0.01).astype(dtype)
    matrix_b = random_gaussian(shape_y, miu=0.1, sigma=0.01).astype(dtype)
    # matrix_a = np.ones(shape_x, dtype=np.float16)
    # matrix_b = np.ones(shape_y, dtype=np.float16)

    # this change is for gen data speed
    matrix_a_for_np = matrix_a.astype(np.float32)
    matrix_b_for_np = matrix_b.astype(np.float32)
    matmul_type = getMatmulType(M, N, K)
    out = np_matmul(matrix_a_for_np, matrix_b_for_np, batch_tuple, M, K, N,
                    trans_data, trans_weight, output_format).astype(out_dtype)
    if dtype == "float16":
        out.astype(np.float16)
    bias_shape = batch_tuple + (N // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT)
    if output_format == "zZ":
        bias_shape = batch_tuple + (1, N // cce.BLOCK_OUT, 1, cce.BLOCK_OUT)
    bias_data = np.full(bias_shape, np.nan, out_dtype)
    if bias == 1:
        bias_data = random_gaussian(bias_shape, miu=0.5, sigma=0.01).astype(out_dtype)
        out = out + bias_data

    shape_x = ()
    shape_y = ()
    if matmul_type == MatmulType.gemm:
        shape_x = (M // cce.BLOCK_IN, cce.BLOCK_IN, K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE)
        if trans_data:
            shape_x = (K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE, M // cce.BLOCK_IN, cce.BLOCK_IN)
        shape_y = (K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE, N // cce.BLOCK_OUT, cce.BLOCK_OUT)
        if trans_weight:
            shape_y = (N // cce.BLOCK_OUT, cce.BLOCK_OUT, K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE)
    elif matmul_type == MatmulType.gevm:
        shape_x = (1, M % cce.BLOCK_IN, K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE)
        shape_y = (K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE, N // cce.BLOCK_OUT, cce.BLOCK_OUT)
    elif matmul_type == MatmulType.gemv:
        # use transpose(b) * transpose(a)
        shape_x = (M // cce.BLOCK_IN, cce.BLOCK_IN, K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE)
        shape_y = (K // cce.BLOCK_REDUCE, cce.BLOCK_REDUCE, 1, N % cce.BLOCK_OUT)

    batch_len = len(batch_tuple)
    # left_format zZ
    if left_format == "zZ":
        trans_x = tuple(range(batch_len)) + (batch_len + 0, batch_len + 2, batch_len + 1, batch_len + 3)
    elif left_format == "zN":
        trans_x = tuple(range(batch_len)) + (batch_len + 2, batch_len + 0, batch_len + 1, batch_len + 3)
    # right_format nZ
    if right_format == "nZ":
        trans_y = tuple(range(batch_len)) + (batch_len + 0, batch_len + 2, batch_len + 3, batch_len + 1)
    elif right_format == "zZ":
        trans_y = tuple(range(batch_len)) + (batch_len + 0, batch_len + 2, batch_len + 1, batch_len + 3)
    elif right_format == "zN":
        trans_y = tuple(range(batch_len)) + (batch_len + 2, batch_len + 0, batch_len + 1, batch_len + 3)
    fractal_a = matrix_a.reshape(batch_tuple + shape_x).transpose(trans_x).copy()
    fractal_b = matrix_b.reshape(batch_tuple + shape_y).transpose(trans_y).copy()
    if matmul_type == MatmulType.gemv:
        trans_y = tuple(range(batch_len)) + (batch_len + 2, batch_len + 0, batch_len + 3, batch_len + 1)
        trans_x = tuple(range(batch_len)) + (batch_len + 2, batch_len + 0, batch_len + 1, batch_len + 3)
        fractal_a = matrix_b.reshape(batch_tuple + shape_y).transpose(trans_y).copy()
        fractal_b = matrix_a.reshape(batch_tuple + shape_x).transpose(trans_x).copy()
    return fractal_a, fractal_b, out, bias_data


def matmul_data(batch_tuple, M, K, N, dtype, out_dtype, bias, adj_x, adj_y, left_format=None, right_format=None,
                output_format=None, debug_logging=False):
    m_x = ()
    m_y = ()
    bench_mark = ()
    bias_data = ()
    logging.debug("gen data start!")
    a = datetime.now()
    m_x, m_y, bench_mark, bias_data = genData(batch_tuple, M, K, N, adj_x, adj_y, dtype, out_dtype, bias,
                                              left_format, right_format, output_format)
    b = datetime.now()
    logging.debug((b - a).seconds)
    logging.debug("gen data end!")
    if debug_logging:
        logging.debug("m_x shape:{}".format(m_x.shape))
        logging.debug("m_y shape:{}".format(m_y.shape))
        logging.debug(type(m_x))
        logging.debug("bench_mark shape: {}".format(bench_mark.shape))
    return m_x, m_y, bench_mark, bias_data


def extract_dim(shape_x, shape_y, adj_x, adj_y):
    rank = len(shape_x)
    m = shape_x[-2] if not adj_x else shape_x[-1]
    k = shape_x[-1] if not adj_x else shape_x[-2]
    n = shape_y[-1] if not adj_y else shape_y[-2]
    batch_tuple = shape_x[:-2] if rank > 2 else (1,)
    return batch_tuple, m, k, n


def reduce_data(reduce_type):
    res = cce.BLOCK_IN
    if reduce_type == "in":
        res = cce.BLOCK_IN
    elif reduce_type == "out":
        res = cce.BLOCK_OUT
    elif reduce_type == "reduce":
        res = cce.BLOCK_REDUCE
    return res


def get_fractal_shape(dim1, dim2, reduce1="in", reduce2="reduce", format="zZ"):
    result = ()
    dim1_reduce = reduce_data(reduce1)
    dim2_reduce = reduce_data(reduce2)
    if format == "zZ":
        result = (dim1 // dim1_reduce, dim2 // dim2_reduce, dim1_reduce, dim2_reduce)
    elif format == "nZ":
        result = (dim1 // dim1_reduce, dim2 // dim2_reduce, dim2_reduce, dim1_reduce)
    elif format == "nN":
        result = (dim2 // dim2_reduce, dim1 // dim1_reduce, dim2_reduce, dim1_reduce)
    elif format == "zN":
        result = (dim2 // dim2_reduce, dim1 // dim1_reduce, dim1_reduce, dim2_reduce)
    return result


def get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format="zZ", right_format="nZ", out_format="zN"):
    matmul_type = getMatmulType(m, n, k)
    if matmul_type == MatmulType.gemm:
        # left_format zZ process
        if left_format == "zZ":
            shape_xx = batch_tuple + get_fractal_shape(m, k, "in", "reduce", "zZ")
            if adj_x:
                shape_xx = batch_tuple + get_fractal_shape(m, k, "in", "reduce", "nN")
        # left_format zN process
        elif left_format == "zN":
            shape_xx = batch_tuple + get_fractal_shape(m, k, "in", "reduce", "zN")
            if adj_x:
                shape_xx = batch_tuple + get_fractal_shape(m, k, "in", "reduce", "nZ")
        else:
            raise RuntimeError("Error: unsupported left matrix format: %s" % left_format)
        # right_format nZ
        if right_format == "nZ":
            shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "nZ")
            if adj_y:
                shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "zN")
        # right_format zZ
        elif right_format == "zZ":
            shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "zZ")
            if adj_y:
                shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "nN")
        elif right_format == "zN":
            shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "zN")
            if adj_y:
                shape_yy = batch_tuple + get_fractal_shape(k, n, "reduce", "out", "nZ")
        else:
            raise RuntimeError("Error: unsupported right matrix format: %s" % right_format)
        # output_format zN
        # output_shape = batch_tuple + (n // cce.BLOCK_OUT, m // cce.BLOCK_IN, cce.BLOCK_IN, cce.BLOCK_OUT)
        if out_format == "zN":
            output_shape = batch_tuple + get_fractal_shape(m, n, "in", "out", "zN")
out_format == "zZ": output_shape = batch_tuple + get_fractal_shape(m, n, "in", "out", "zZ") else: raise RuntimeError("Error: unsupport output matrix format: %s" % out_format) elif matmul_type == MatmulType.gevm: shape_xx = batch_tuple + (1, k // cce.BLOCK_REDUCE, m % cce.BLOCK_IN, cce.BLOCK_REDUCE) shape_yy = batch_tuple + (k // cce.BLOCK_REDUCE, n // cce.BLOCK_OUT, cce.BLOCK_OUT, cce.BLOCK_REDUCE) output_shape = batch_tuple + (n // cce.BLOCK_OUT, 1, m % cce.BLOCK_IN, cce.BLOCK_OUT) elif matmul_type == MatmulType.gemv: # transpose of b * transpose of a shape_xx = batch_tuple + (1, k // cce.BLOCK_REDUCE, n % cce.BLOCK_IN, cce.BLOCK_REDUCE) shape_yy = batch_tuple + (k // cce.BLOCK_REDUCE, m // cce.BLOCK_OUT, cce.BLOCK_OUT, cce.BLOCK_REDUCE) output_shape = batch_tuple + (m // cce.BLOCK_OUT, 1, n % cce.BLOCK_IN, cce.BLOCK_OUT) if bias == 1: if out_format == "zN": bias_shape_nc1hwc0 = batch_tuple + (n // cce.BLOCK_OUT, 1, 1, cce.BLOCK_OUT) elif out_format == "zZ": bias_shape_nc1hwc0 = batch_tuple + (1, n // cce.BLOCK_OUT, 1, cce.BLOCK_OUT) else: bias_shape_nc1hwc0 = None return shape_xx, shape_yy, bias_shape_nc1hwc0, output_shape, k def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs): ''' There are four types of fractal format in Davinci core: zZ, zN, nZ, nN general matmul format left_trans: False right_trans False: zZ * nZ = zN left_trans: True right_trans False: nN * nZ = zN left_trans: False right_trans True : zZ * zN = zN left_trans: True right_trans True : nN * zN = zN Now we need to support: zN * nZ = zN use left_format to specify, left matrix data format use right_format to specify, right matrix data format ''' batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y) m = (m + 15) // 16 * 16 n = (n + 15) // 16 * 16 k = (k + 15) // 16 * 16 shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, out_format) mod = matmul_compile(shape_x, shape_y, bias, left_format, right_format, out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs) # Generate data m_x, m_y, bench_mark, bias_data = matmul_data(batch_tuple, m, k, n, dtype, out_dtype, bias, adj_x, adj_y, left_format, right_format, out_format) # mod launch output = np.full(out_shape, np.nan, out_dtype) if bias == 0: output = utils.mod_launch(mod, (m_x, m_y, output), expect=bench_mark) elif bias == 1: output = utils.mod_launch(mod, (m_x, m_y, bias_data, output), expect=bench_mark) # compare result rtol, atol = get_rtol_atol("matmul", dtype) compare_result = compare_tensor(output, bench_mark, rtol=rtol, atol=atol, equal_nan=True) # compare_result = utils.result_compare(output, bench_mark, r_tol=5e-3) return (m_x, m_y), output, bench_mark, compare_result def matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs): batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y) m = (m + 15) // 16 * 16 n = (n + 15) // 16 * 16 k = (k + 15) // 16 * 16 shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, output_format) input_shapes = [shape_xx, shape_yy, bias_shape] input_types = [dtype, dtype, out_dtype] has_bias = False if bias == 1: has_bias = True op_attrs = [out_dtype, left_format, right_format, output_format, adj_x, adj_y, has_bias, attrs] if has_bias == False: input_shapes = [shape_xx, shape_yy] 
        input_types = [dtype, dtype]
        op_attrs = [None, out_dtype, left_format, right_format, output_format, adj_x, adj_y, has_bias, attrs]
    return utils.op_build_test(matmul.matmul, input_shapes, input_types, op_attrs, kernel_name, attrs)
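

# Illustrative usage sketch (an assumption added for clarity, not part of the original test suite).
# It exercises only the numpy data-generation path above, so no Davinci device or compiled kernel
# is needed. The batch tuple and the M/K/N values below are hypothetical example sizes, chosen as
# multiples of 16 because cce.BLOCK_IN, cce.BLOCK_OUT and cce.BLOCK_REDUCE are all 16 on this target.
if __name__ == "__main__":
    example_batch = (1,)
    example_m, example_k, example_n = 32, 64, 48
    # Fractal shapes the kernel would be built with (zZ * nZ = zN layout, no transpose, no bias).
    shape_xx, shape_yy, bias_shape, out_shape, _ = get_converted_shapes(
        example_m, example_n, example_k, example_batch, adj_x=False, adj_y=False,
        bias=0, left_format="zZ", right_format="nZ", out_format="zN")
    logging.debug("left fractal shape: %s", shape_xx)
    logging.debug("right fractal shape: %s", shape_yy)
    logging.debug("output fractal shape: %s", out_shape)
    # Fractal-tiled inputs plus the numpy benchmark result for the same configuration.
    m_x, m_y, bench_mark, bias_data = matmul_data(
        example_batch, example_m, example_k, example_n, "float16", "float16",
        bias=0, adj_x=False, adj_y=False, left_format="zZ", right_format="nZ",
        output_format="zN", debug_logging=True)
    # The generated arrays should match the converted shapes computed above.
    assert m_x.shape == shape_xx and m_y.shape == shape_yy
    assert bench_mark.shape == out_shape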