# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import numpy as np

from tensorio import compare_tensor
from akg.utils import kernel_exec as utils
from akg.utils.kernel_exec import gen_kernel_name
from akg.ops.nn import conv_input_ad
from test_run.conv_utils import conv_forward_naive
from gen_random import random_gaussian
from base import get_rtol_atol


def compare_5D(out_data, expect):
    """Compare two 5D (NC1HWC0) tensors with a 5% relative tolerance.

    Returns False when 16 or more consecutive elements (in scan order) are out
    of tolerance, True otherwise."""
    actual = out_data
    N, C1, H, W, C0 = out_data.shape
    error = 0
    count = 0
    lastErr = -2
    continueErr = 0
    maxContinue = -1
    maxEnd = 0
    partial_debug = 0
    for n in range(N):
        for c1 in range(C1):
            for h in range(H):
                for w in range(W):
                    for c0 in range(C0):
                        a = actual[n, c1, h, w, c0]
                        b = expect[n, c1, h, w, c0]
                        if abs(a - b) > abs(b) * 5e-02:
                            if partial_debug and (a == 0.0):
                                continue
                            error += 1
                            if lastErr + 1 == count:
                                continueErr += 1
                            else:
                                if continueErr > maxContinue:
                                    maxContinue = continueErr
                                    maxEnd = lastErr
                                continueErr = 1
                            lastErr = count
                        count += 1
    if continueErr > maxContinue:
        maxContinue = continueErr
        maxEnd = lastErr
    print("error num: %d/%d (%.2f%%)" % (error, count, 100.0 * error / count))
    print("longest error range: [%d, %d]" % (maxEnd - maxContinue + 1, maxEnd))
    return maxContinue < 16
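

# All tensors below use Ascend's 5D NC1HWC0 layout: the channel axis is split
# into C1 groups of C0 = 16 lanes. A minimal sketch of that mapping, assuming
# the channel count is already padded to a multiple of C0 (the helper name
# _nchw_to_nc1hwc0 is ours for illustration; the tests below do not call it):
def _nchw_to_nc1hwc0(t, c0=16):
    n, c, h, w = t.shape
    # (N, C, H, W) -> (N, C1, C0, H, W) -> (N, C1, H, W, C0)
    return t.reshape(n, c // c0, c0, h, w).transpose(0, 1, 3, 4, 2).copy()
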

def conv_input_ad_run(fmap_shape, filter_shape, pad_, stride_, dilation_, attrs=None):
    """Build and launch the conv_input_ad kernel, then check dx against the
    numpy reference produced by gen_data_dx."""
    conv_dtype = 'float16'
    block_size = 16

    in_n, in_c, in_h, in_w = fmap_shape
    cout, cin, w_h, w_w = filter_shape
    assert in_c == cin

    # pad channel counts up to a multiple of the block size
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_

    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    w_shape = (cout, in_c, w_h, w_w)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, k_c1, k_h, k_w, k_c0 = kernel_shape_nc1hwc0
    kernel_shape_fractal = (k_c // block_size * k_h * k_w, k_n // block_size, block_size, block_size)

    y_shape = (out_n, out_c, out_h, out_w)
    y_5D_shape = (out_n, out_c // block_size, out_h, out_w, block_size)

    # inputs of the backward kernel: the 5D head (dy) and the fractal weight
    dx_input_shapes = [y_5D_shape, kernel_shape_fractal]

    input_file = os.environ.get("RANDOM_DATA_DISK_PATH", "")
    expect_file = input_file + "/" + gen_kernel_name(
        [dx_input_shapes], [conv_dtype],
        op_attrs=[fmap_shape, filter_shape, pad_, stride_, dilation_, attrs],
        kernel_name='conv_input_ad') + ".bin"

    print("gen_data begin.")
    fmap_data, filter_data, expect = gen_data_dx(fmap_shape, filter_shape, pad_, stride_, dilation_,
                                                 expect_file, attrs=attrs)
    print("gen_data finished.")

    out_data = np.full(expect.shape, 0, 'float16')
    np_input = (fmap_data, filter_data)

    flag_w = os.environ.get("WRITE_TO_DISK", "No")
    if flag_w == "Yes":
        # only dump the input data to disk; skip building and running the kernel
        return np_input, out_data, expect, True

    mod = utils.op_build_test(conv_input_ad.conv_input_ad, [dx_input_shapes], [conv_dtype],
                              op_attrs=[fmap_shape, filter_shape, pad_, stride_, dilation_, attrs],
                              kernel_name='conv_input_ad', attrs=attrs)
    args = (fmap_data, filter_data, out_data)
    out_data = utils.mod_launch(mod, args, expect=expect)
    rtol, atol = get_rtol_atol("conv_input_ad", conv_dtype)
    assert_res = compare_tensor(out_data, expect, rtol=rtol, atol=atol, equal_nan=True)

    return np_input, out_data, expect, assert_res


def conv_input_ad_reuse_forward_run(fmap_shape, filter_shape, pad_, stride_, dilation_, Tile=None, attrs=None):
    """Check that dx can be computed by reusing the forward conv kernel on a
    strided head and a flipped weight; also validates the helper kernels that
    produce the strided head and the flipped weight."""
    if Tile is None:
        Tile = [0, 0, 0, 0, 0]

    mod_data, mod_head_strided, mod_weight_flipped = conv_input_ad.conv_input_ad_reuse_forward(
        fmap_shape, filter_shape, pad_, stride_, dilation_,
        tile_hh=Tile[0], tile_coco=Tile[1], tile_mm=Tile[2], tile_kk=Tile[3], tile_nn=Tile[4],
        bypass_l1=True, use_bias=False, block_size=16, conv_dtype='float16')

    in_n, in_c, in_h, in_w = fmap_shape
    k_n, k_c, k_h, k_w = filter_shape
    pad_h, pad_w, pad_l, pad_r = pad_
    s_h, s_w = stride_
    d_h, d_w = dilation_
    block_size = 16

    o_n = in_n
    o_c = k_n
    o_h = 1 + (in_h + 2 * pad_h - (k_h - 1) * d_h - 1) // s_h
    o_w = 1 + (in_w + 2 * pad_w - (k_w - 1) * d_w - 1) // s_w

    Head_strided_shape = (o_n, o_c, (o_h - 1) * s_h + 1, (o_w - 1) * s_w + 1)
    B_flip_shape = (k_c, k_n, k_h, k_w)
    fmap_data_01, filter_data_01, expect_01 = gen_data(Head_strided_shape, B_flip_shape, k_h - 1, 1, 1, strided=s_h)

    if s_h <= 1:
        Head_origin_5D = fmap_data_01
    else:
        # gather the non-zero positions of the strided head back into the compact head
        Head_origin_5D = np.full((o_n, o_c // block_size, o_h, o_w, block_size), 0, 'float16')
        for i0 in range(Head_origin_5D.shape[0]):
            for i1 in range(Head_origin_5D.shape[1]):
                for i2 in range(Head_origin_5D.shape[2]):
                    for i3 in range(Head_origin_5D.shape[3]):
                        for i4 in range(Head_origin_5D.shape[4]):
                            Head_origin_5D[i0, i1, i2, i3, i4] = fmap_data_01[i0, i1, i2 * s_h, i3 * s_w, i4]

    B_origin_Fractal = np.flip(np.flip(filter_data_01, 1), 2).reshape(
        (k_n // block_size, k_h, k_w, k_c // block_size, block_size, block_size))
    B_origin_Fractal = np.transpose(B_origin_Fractal, (3, 1, 2, 0, 5, 4)).reshape(
        (k_c // block_size * k_h * k_w, k_n // block_size, block_size, block_size))

    B_flipped_Fractal = np.reshape(filter_data_01,
                                   (k_n // block_size, k_h, k_w, k_c // block_size, block_size, block_size))
    B_flipped_Fractal = np.reshape(B_flipped_Fractal,
                                   (k_n // block_size * k_h * k_w, k_c // block_size, block_size, block_size))

    out_data_01 = np.full(expect_01.shape, 0, 'float16')
    input_01 = (fmap_data_01, filter_data_01)
    args = (fmap_data_01, filter_data_01, out_data_01)
    out_data_01 = utils.mod_launch(mod_data, args, expect=expect_01)
    assert_res = compare_5D(out_data_01, expect_01)

    H_strided = np.full((o_n, o_c // block_size, (o_h - 1) * s_h + 1, (o_w - 1) * s_w + 1, block_size), 0, 'float16')
    H_strided = utils.mod_launch(mod_head_strided, (Head_origin_5D, H_strided), expect=expect_01)

    B_flipped = np.full((k_n // block_size * k_h * k_w, k_c // block_size, block_size, block_size), 0, 'float16')
    B_flipped = utils.mod_launch(mod_weight_flipped, (B_origin_Fractal, B_flipped), expect=expect_01)

    assert_res &= compare_5D(H_strided, fmap_data_01)

    # element-wise check of the flipped weight against the numpy reference
    tmp1 = B_flipped_Fractal.reshape(-1).copy()
    tmp2 = B_flipped.reshape(-1).copy()
    ind = []
    for i in range(len(tmp1)):
        if np.abs(tmp1[i] - tmp2[i]) > 0.05:
            ind.append(i)
    print("Len of bad indices: ", len(ind))
    assert_res &= (len(ind) == 0)

    print("Test result for conv_input_ad = ", assert_res)
    return input_01, out_data_01, expect_01, assert_res
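

# For stride > 1, dx is computed by zero-inserting ("striding") the head and
# then running a stride-1 convolution. conv_input_ad_reuse_forward_run above
# builds the strided head on device, and calculate_conv_backprop_input below
# does it with explicit loops. A vectorized numpy sketch of the same transform
# (the name _stride_head is illustrative and unused by the tests; it follows
# the (h - 1) * s + 1 extent used for Head_strided_shape above):
def _stride_head(dy, s_h, s_w):
    n, c, h, w = dy.shape
    out = np.zeros((n, c, (h - 1) * s_h + 1, (w - 1) * s_w + 1), dy.dtype)
    out[:, :, ::s_h, ::s_w] = dy  # dy[..., h, w] lands at out[..., h * s_h, w * s_w]
    return out
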

def gen_data(fm_shape, w_shape, pad, stride, dilation, strided=-1):
    """Generate a random feature map, weight, and forward-conv output in 5D
    layouts; when strided > 1 the feature map is zero-filled between rows and
    columns so it matches a strided head."""
    IN, IC, IH, IW = fm_shape
    C0 = 16
    IC = ((IC + C0 - 1) // C0) * C0

    WN, WC, WH, WW = w_shape
    WN = ((WN + C0 - 1) // C0) * C0
    WC = ((WC + C0 - 1) // C0) * C0

    ON = IN
    OC = WN
    WHD = (WH - 1) * dilation + 1
    WWD = (WW - 1) * dilation + 1
    OH = (IH + 2 * pad - WHD) // stride + 1
    OW = (IW + 2 * pad - WWD) // stride + 1

    if strided <= 1:
        x = random_gaussian((IN, IC, IH, IW), miu=1, sigma=0.1).astype(np.float16)
    else:
        # scatter a smaller random tensor onto a zero canvas with `strided` spacing
        x_tmp = random_gaussian((IN, IC, (IH // strided + 1), (IW // strided + 1)),
                                miu=1, sigma=0.1).astype(np.float16)
        x = np.full((IN, IC, IH, IW), 0, dtype=np.float16)
        for i0 in range(x_tmp.shape[0]):
            for i1 in range(x_tmp.shape[1]):
                for i2 in range(x_tmp.shape[2]):
                    for i3 in range(x_tmp.shape[3]):
                        x[i0, i1, i2 * strided, i3 * strided] = x_tmp[i0, i1, i2, i3]

    w = random_gaussian((WN, WC, WH, WW), miu=0.5, sigma=0.01).astype(np.float16)

    conv_param = {'stride': stride, 'pad': pad, 'dilation': dilation}
    out = conv_forward_naive(x, w, None, conv_param)

    # transpose to 5D - NC1HWC0
    feature = x.reshape(IN, IC // C0, C0, IH, IW).transpose(0, 1, 3, 4, 2).copy()
    # transpose to 5D - C1HWNC0
    filter = w.reshape(WN, WC // C0, C0, WH, WW).transpose(1, 3, 4, 0, 2).copy()
    # transpose to 5D - NC1HWC0
    output = out.reshape(ON, OC // C0, C0, OH, OW).transpose(0, 1, 3, 4, 2).copy()

    return feature, filter, output


def calculate_conv_backprop_input(x, w, dy, pad_list, stride_list):
    """Naive numpy reference for the conv2d input gradient dx, returned in the
    5D NC1HWC0 layout."""
    N, C, H, W = dy.shape
    Cin, Cout, KH, KW = w.shape
    assert C == Cin
    pad_top, pad_bottom, pad_left, pad_right = pad_list
    stride_h, stride_w = stride_list

    if stride_h > 1 or stride_w > 1:
        # zero-insert between dy elements so the backprop becomes a stride-1 convolution
        dy_ = np.full((N, C, H * stride_h, W * stride_w), 0, np.float16)
        for nn in range(N):
            for nc in range(C):
                for nh in range(H):
                    for nw in range(W):
                        dy_[nn, nc, nh * stride_h, nw * stride_w] = dy[nn, nc, nh, nw]
        dy = dy_
        H = H * stride_h
        W = W * stride_w
        stride_h = 1
        stride_w = 1

    H_out = (H + pad_top + pad_bottom - KH) // stride_h + 1
    W_out = (W + pad_left + pad_right - KW) // stride_w + 1
    assert H_out == x.shape[2]
    assert W_out == x.shape[3]

    dy_pad = np.pad(dy, ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right)),
                    mode='constant', constant_values=0)

    dx = np.zeros_like(x)
    # rotate the kernel 180 degrees and swap its input/output channel axes
    w_trans = np.zeros((w.shape[1], w.shape[0], w.shape[2], w.shape[3]))
    for cout in range(Cout):
        for cin in range(Cin):
            for kh in range(KH):
                for kw in range(KW):
                    w_trans[cout, cin, kh, kw] = w[cin, cout, KH - 1 - kh, KW - 1 - kw]

    for nn in range(N):
        for nc in range(Cout):
            for nh in range(H_out):
                for nw in range(W_out):
                    dx[nn, nc, nh, nw] += np.sum(
                        dy_pad[nn, :, nh * stride_h: nh * stride_h + KH, nw * stride_w: nw * stride_w + KW]
                        * w_trans[nc, :, :, :], axis=(0, 1, 2))

    N, C, H, W = x.shape
    # transpose to 5D - NC1HWC0
    dx = dx.reshape(N, C // 16, 16, H, W).transpose(0, 1, 3, 4, 2).copy()
    return dx.astype(np.float16)
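

# The nested-loop construction of w_trans in calculate_conv_backprop_input is
# the usual "rotate the kernel 180 degrees and swap the channel axes" identity.
# An equivalent vectorized form (not used here, shown only for reference):
#
#     w_trans = np.flip(w, axis=(2, 3)).transpose(1, 0, 2, 3)
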

def gen_data_dx(fmap_shape, filter_shape, pad_, stride_, dilation_, expect_file, attrs=None):
    """Generate random dout and weight (in 5D layouts) plus the expected dx;
    the expected dx is read from expect_file when it already exists on disk."""
    block_size = 16

    in_n, in_c, in_h, in_w = fmap_shape
    cout, cin, w_h, w_w = filter_shape
    assert in_c == cin

    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_
    dilation_h, dilation_w = dilation_
    assert dilation_h == 1
    assert dilation_w == 1

    x_shape = (in_n, in_c, in_h, in_w)
    w_shape = (cout, in_c, w_h, w_w)
    b_shape = (w_shape[0], )

    # padding of the equivalent stride-1 forward convolution that computes dx
    p_top = w_h - pad_top - 1
    p_left = w_w - pad_left - 1
    p_bottom = in_h + pad_top - stride_h * ((in_h + pad_top + pad_bottom - w_h) // stride_h + 1)
    p_right = in_w + pad_left - stride_w * ((in_w + pad_left + pad_right - w_w) // stride_w + 1)

    print("Data gen ...")
    x = random_gaussian(x_shape, miu=1, sigma=0.1).astype(np.float16)
    w = random_gaussian(w_shape, miu=1, sigma=0.1).astype(np.float16)

    Ho = (x_shape[2] + pad_top + pad_bottom - w_shape[2]) // stride_h + 1
    Wo = (x_shape[3] + pad_left + pad_right - w_shape[3]) // stride_w + 1
    out_shape = (x_shape[0], w_shape[0], Ho, Wo)
    dout = random_gaussian(out_shape, miu=1, sigma=0.1).astype(np.float16)

    dx_shape = (in_n, in_c // block_size, in_h, in_w, block_size)

    flag_w = os.environ.get("WRITE_TO_DISK", "No")
    if flag_w == "No" and os.path.exists(expect_file):
        # read expect from file
        dx = np.fromfile(expect_file, np.float16).reshape(dx_shape)
    else:
        # compute expect data
        dx = calculate_conv_backprop_input(x, w, dout, [p_top, p_bottom, p_left, p_right],
                                           [stride_h, stride_w])

    if flag_w == "Yes":
        # write expect to file (binary mode; tofile writes raw bytes)
        with open(expect_file, "wb") as file:
            dx.tofile(file)

    # reshape to the device layouts
    C0 = block_size
    ON, OC, OH, OW = out_shape
    WN, WC, WH, WW = w_shape
    dout = dout.reshape(ON, OC // C0, C0, OH, OW).transpose(0, 1, 3, 4, 2).copy()
    w = w.reshape(WN, WC // C0, C0, WH, WW).transpose(1, 3, 4, 0, 2).copy()

    return dout, w, dx
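

# Example invocation with hypothetical shapes (requires a configured akg /
# Ascend runtime, so it is left commented out rather than executed on import):
#
#     conv_input_ad_run(fmap_shape=(1, 16, 16, 16), filter_shape=(16, 16, 3, 3),
#                       pad_=(1, 1, 1, 1), stride_=(1, 1), dilation_=(1, 1))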