# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer

import unittest


def generate_basic_params(mode="attn", self_attention=True):
    """Draw a random set of hyper-parameters for one transformer test case.

    All sizes come from ``np.random.randint``; every dropout rate is fixed
    to ``0.0``. The order of the random draws below is kept stable so that a
    seeded ``np.random`` state always produces the same configuration.

    Args:
        mode: ``"attn"``, ``"encoder_layer"`` or ``"decoder_layer"``.
        self_attention: Only read when ``mode == "attn"``. When true the
            key/value dims and lengths mirror the query; otherwise they are
            drawn independently.

    Returns:
        * ``"attn"``: ``(batch_size, query_length, key_length, value_length,
          embed_dim, kdim, vdim, num_heads, attn_dropout)``
        * ``"encoder_layer"``: ``(batch_size, embed_dim, num_heads,
          dim_feedforward, dropout, attn_dropout, act_dropout,
          sequence_length)``
        * ``"decoder_layer"``: the encoder tuple plus ``target_length``.
        * any other mode: ``None``.
    """
    batch_size = np.random.randint(2, 10)
    query_length = np.random.randint(2, 10)
    d_head = np.random.randint(3, 10)
    num_heads = np.random.randint(3, 10)
    # embed_dim is built as a multiple of num_heads so it splits evenly.
    embed_dim = d_head * num_heads
    attn_dropout = 0.0

    if mode == "attn":
        if self_attention:
            kdim = vdim = embed_dim
            key_length = value_length = query_length
        else:
            kdim = np.random.randint(5, 20)
            vdim = np.random.randint(5, 20)
            # Keys and values always share one sequence length.
            key_length = value_length = np.random.randint(2, 10)
        return (batch_size, query_length, key_length, value_length, embed_dim,
                kdim, vdim, num_heads, attn_dropout)

    dropout = act_dropout = 0.0
    dim_feedforward = np.random.randint(128, 1024)
    sequence_length = np.random.randint(2, 10)
    layer_params = (batch_size, embed_dim, num_heads, dim_feedforward, dropout,
                    attn_dropout, act_dropout, sequence_length)
    if mode == "encoder_layer":
        return layer_params
    if mode == "decoder_layer":
        target_length = np.random.randint(2, 10)
        return layer_params + (target_length, )
    # Unrecognised modes yield no parameters.
    return None


def generate_query_key_value_cache(self_attention,
                                   batch_size,
                                   num_heads,
                                   query_length,
                                   embed_dim,
                                   key_length=None,
                                   value_length=None,
                                   kdim=None,
                                   vdim=None,
                                   cache=None):
    """Create random attention inputs, a mask and (optionally) cache arrays.

    Args:
        self_attention: When true, key and value are the query array itself.
        batch_size, num_heads, query_length, embed_dim: Sizes of the inputs.
        key_length, value_length: Sequence lengths of key / value; also used
            for the mask and the cache arrays.
        kdim, vdim: Feature sizes of key / value (cross-attention only).
        cache: Truthy to also generate cache arrays.

    Returns:
        ``(query, key, value, attn_mask, cache_dict)``. ``attn_mask`` is a
        zero array of shape ``(batch_size, num_heads, query_length,
        key_length)`` whose very first entry is ``-1e9``. ``cache_dict`` is
        ``None`` without a cache; otherwise it holds float32 arrays of shape
        ``(batch_size, num_heads, length, head_dim)`` under ``"k"``/``"v"``
        (self-attention) or ``"static_k"``/``"static_v"`` (cross-attention).
    """

    def _random_float32(*shape):
        return np.random.rand(*shape).astype("float32")

    query = _random_float32(batch_size, query_length, embed_dim)

    attn_mask = np.zeros((batch_size, num_heads, query_length, key_length))
    attn_mask[0, 0, 0, 0] = -1e9

    head_dim = embed_dim // num_heads
    if self_attention:
        key = value = query
    else:
        key = _random_float32(batch_size, key_length, kdim)
        value = _random_float32(batch_size, value_length, vdim)

    if not cache:
        return query, key, value, attn_mask, None

    if self_attention:
        k_name, v_name = "k", "v"
    else:
        k_name, v_name = "static_k", "static_v"
    cache_dict = {
        k_name: _random_float32(batch_size, num_heads, key_length, head_dim),
        v_name: _random_float32(batch_size, num_heads, value_length, head_dim),
    }
    return query, key, value, attn_mask, cache_dict


def fc(x, weight):
    """Bias-free fully connected layer: return ``np.matmul(x, weight)``."""
    projected = np.matmul(x, weight)
    return projected


def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    Args:
        x: Array of scores; normalisation runs along its last axis.

    Returns:
        A float64 array with the same shape as ``x``. A row that consists
        only of ``-inf`` (a fully masked row) comes out as NaN.
    """
    x = np.asarray(x)
    # A fully masked row computes ``-inf - (-inf)``, which raises NumPy's
    # "invalid" floating-point signal. Silence it for this computation only,
    # instead of changing the process-wide error state with np.seterr.
    with np.errstate(invalid='ignore'):
        # Subtracting the row maximum keeps exp() from overflowing.
        shifted = x - np.amax(x, axis=-1, keepdims=True)
        e_x = np.exp(shifted)
        output = e_x / np.sum(e_x, axis=-1, keepdims=True)
    return output.astype(np.float64)


def batch_matmul(x, y):
    """Multiply matching 2-D slices of two 4-D arrays.

    The two leading dimensions of ``x`` and ``y`` must agree; the matrix at
    ``x[i, j]`` is multiplied with the matrix at ``y[i, j]``.

    Returns:
        A float64 array of shape
        ``(x.shape[0], x.shape[1], x.shape[2], y.shape[3])``.
    """
    assert x.shape[0] == y.shape[0]
    assert x.shape[1] == y.shape[1]
    outer, inner = x.shape[0], x.shape[1]
    product = np.zeros((outer, inner, x.shape[2], y.shape[3]), dtype=np.float64)
    for i, j in np.ndindex(outer, inner):
        product[i, j] = np.matmul(x[i, j], y[i, j])
    return product


def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn):
    """NumPy reference for scaled dot-product attention with merged heads.

    Args:
        q, k, v: 4-D arrays; the last two axes of ``k`` are swapped before
            the product with ``q``.
        d_key: Per-head key size; ``k`` is divided by ``sqrt(d_key)``.
        attn_mask: Optional additive mask applied to the scores before the
            softmax; skipped when ``None``.
        multi_head_attn: Not used by this function.

    Returns:
        A 3-D array: the attention result with axes 1 and 2 swapped and the
        last two axes then merged into one.
    """
    k_transposed = k.transpose([0, 1, 3, 2])
    scale = np.sqrt(d_key, dtype=np.float64)
    scores = batch_matmul(q, k_transposed / scale)
    if attn_mask is not None:
        scores += attn_mask
    probabilities = softmax(scores)
    context = batch_matmul(probabilities, v).transpose((0, 2, 1, 3))
    dim0, dim1, dim2, dim3 = context.shape
    return context.reshape((dim0, dim1, dim2 * dim3))


def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn):
    """Project key and value with the layer's weights and split into heads.

    Args:
        key, value: 3-D NumPy arrays to project.
        num_heads: Number of attention heads.
        embed_dim: Embedding size; ``embed_dim // num_heads`` is the head size.
        multi_head_attn: Layer providing ``k_proj.weight`` / ``v_proj.weight``.

    Returns:
        ``(k, v)``: each projection reshaped to
        ``(dim0, dim1, num_heads, head_dim)`` and then transposed with axes
        ``(0, 2, 1, 3)``. Only the weights are applied; no bias is added.
    """

    def _split_heads(projected, head_dim):
        per_head = projected.reshape(
            (projected.shape[0], projected.shape[1], num_heads, head_dim))
        return per_head.transpose((0, 2, 1, 3))

    with fluid.dygraph.guard():
        head_dim = embed_dim // num_heads
        k_weight = multi_head_attn.k_proj.weight.numpy()
        v_weight = multi_head_attn.v_proj.weight.numpy()
        k_projected = fc(key, k_weight)
        v_projected = fc(value, v_weight)
        return (_split_heads(k_projected, head_dim),
                _split_heads(v_projected, head_dim))


def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention,
                multi_head_attn, cache_dict):
    """Build the per-head q, k and v arrays for the NumPy attention reference.

    ``q`` is always projected with ``multi_head_attn.q_proj.weight``. For
    cross-attention with a non-empty cache, ``k`` and ``v`` are read from
    ``cache_dict["static_k"]`` / ``cache_dict["static_v"]``. Otherwise they
    are computed by ``cal_qkv`` and, when ``cache_dict`` is not ``None``,
    appended to ``cache_dict["k"]`` / ``cache_dict["v"]`` along axis 2.

    Returns:
        ``(q, k, v, cache_dict)``; ``cache_dict`` is passed through unchanged.
    """
    q_projected = fc(query, multi_head_attn.q_proj.weight.numpy())
    q = q_projected.reshape((q_projected.shape[0], q_projected.shape[1],
                             num_heads, embed_dim // num_heads))
    q = q.transpose((0, 2, 1, 3))

    if not self_attention and cache_dict:
        # Cross-attention reuses the pre-computed keys / values as they are.
        return (q, cache_dict["static_k"], cache_dict["static_v"], cache_dict)

    k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn)
    if cache_dict is not None:
        k = np.concatenate((cache_dict["k"], k), axis=2)
        v = np.concatenate((cache_dict["v"], v), axis=2)
    return (q, k, v, cache_dict)


def add(x, y=None):
    """Return ``x`` as a NumPy array, with ``y`` added when it is given.

    ``x`` is converted with ``x.numpy()`` unless it already is an ndarray.
    The addition uses ``+=``, so an ndarray passed as ``x`` is modified in
    place and returned.
    """
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        if not isinstance(x, np.ndarray):
            x = x.numpy()
        if y is None:
            return x
        x += y
        return x


def relu(x):
    """ReLU implemented as ``x`` multiplied by the boolean mask ``x > 0``."""
    return x * (x > 0)


def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None):
    """NumPy reference for layer normalisation over the last axis.

    Args:
        x: 3-D array, unpacked as ``(batch_size, src_len, d_model)``.
        normalized_shape: Not used by this function.
        norm: Layer providing the scale (``norm.weight``) and the shift
            (``norm.bias``).
        epsilon: Added to the variance before the square root.
        act: Not used by this function.

    Returns:
        An array with the same shape as ``x``:
        ``weight * (x - mean) / sqrt(var + epsilon) + bias``, with mean and
        variance taken per position over ``d_model``.
    """
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        gamma = norm.weight.numpy()
        beta = norm.bias.numpy()

        batch_size, src_len, d_model = x.shape
        # Flatten to one row per position so the statistics run along axis 1.
        rows = x.reshape((batch_size * src_len, d_model))
        mean = np.mean(rows, axis=1, keepdims=True)
        centered = rows - mean
        variance = np.sum(np.square(centered), axis=1) / d_model
        std = np.sqrt(variance + epsilon)
        normalized = centered / std[:, np.newaxis]
        result = (gamma * normalized + beta).reshape(
            (batch_size, src_len, d_model))
    return result


def ffn(src, encoder_layer, ffn_fc1_act="relu"):
    """NumPy reference for the position-wise feed-forward block.

    Computes ``relu(src @ linear1.weight) @ linear2.weight`` using the
    weights of ``encoder_layer``; no bias is added.

    Args:
        src: ndarray, or an object exposing ``.numpy()``.
        encoder_layer: Layer providing ``linear1`` and ``linear2``.
        ffn_fc1_act: Activation name; must be ``"relu"``.
    """
    assert ffn_fc1_act == "relu", "only relu is supported"
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        if not isinstance(src, np.ndarray):
            src = src.numpy()
        first_weight = encoder_layer.linear1.weight.numpy()
        second_weight = encoder_layer.linear2.weight.numpy()
        hidden = relu(fc(src, first_weight))
        return fc(hidden, second_weight)


class TestTransformer(unittest.TestCase):
    """Tests for the layers imported from ``paddle.nn.layer.transformer``.

    The attention and single-layer tests compare paddle's dygraph output on
    CPU with a reference path assembled from the NumPy helpers in this file
    (plus separately constructed ``MultiHeadAttention`` instances for the
    layer tests). The stacked encoder / decoder / transformer tests run one
    forward pass and check the output shape.
    """

    def test_multi_head_attention(self):
        """MultiHeadAttention must match the NumPy attention reference."""

        def multihead_attention_test_helper(self_attention, cache):
            """Run one case: self/cross attention, with or without a cache."""
            paddle.manual_seed(2020)
            paddle.framework.random._manual_program_seed(2020)
            # self_attention|cross_attention, cache|No cache
            with fluid.dygraph.guard(fluid.CPUPlace()):

                # generate params for multi_head_attention
                batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params(
                    "attn", self_attention)
                query, key, value, attn_mask, cache_dict = generate_query_key_value_cache(
                    self_attention, batch_size, num_heads, query_length,
                    embed_dim, key_length, value_length, kdim, vdim, cache)
                if cache and self_attention:
                    # Cached keys are prepended to the new ones, so the mask
                    # has to cover twice the key length.
                    attn_mask = np.concatenate((attn_mask, attn_mask), axis=3)
                need_weight, param_attr, bias_attr = False, None, None
                # call paddle's function
                multi_head_attn = MultiHeadAttention(
                    embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight,
                    param_attr, bias_attr)
                # construct cache object
                cache_obj = None
                if cache_dict:
                    # Both keys are tested explicitly: `'k' and 'v' in d`
                    # would only check the second one.
                    if 'k' in cache_dict and 'v' in cache_dict:
                        cache_obj = multi_head_attn.Cache(
                            paddle.to_variable(cache_dict['k']),
                            paddle.to_variable(cache_dict['v']))
                    elif 'static_k' in cache_dict and 'static_v' in cache_dict:
                        cache_obj = multi_head_attn.StaticCache(
                            paddle.to_variable(cache_dict['static_k']),
                            paddle.to_variable(cache_dict['static_v']))
                if attn_mask is not None:
                    attn_output = multi_head_attn(
                        paddle.to_variable(query),
                        paddle.to_variable(key),
                        paddle.to_variable(value),
                        paddle.to_variable(attn_mask), cache_obj)
                else:
                    attn_output = multi_head_attn(
                        paddle.to_variable(query),
                        paddle.to_variable(key),
                        paddle.to_variable(value), attn_mask, cache_obj)
                # With a cache the layer returns a tuple; the attention
                # output is its first element.
                attn_output = attn_output[0] if cache_dict else attn_output

                # implementation by numpy
                # compute q, k, v
                q, k, v, _ = prepare_qkv(query, key, value, num_heads,
                                         embed_dim, self_attention,
                                         multi_head_attn, cache_dict)
                # scale dot product attention
                attn_heads = scaled_dot_product_attention(
                    q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn)
                out_proj_weight = multi_head_attn.out_proj.weight.numpy()
                reference = fc(attn_heads, out_proj_weight)

                np.testing.assert_allclose(
                    attn_output.numpy(), reference, atol=1e-6)

        multihead_attention_test_helper(True, True)
        multihead_attention_test_helper(True, False)
        multihead_attention_test_helper(False, True)
        multihead_attention_test_helper(False, False)

    def test_transformer_encoder_layer(self):
        """TransformerEncoderLayer must match the step-by-step reference."""

        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.framework.manual_seed(2020)
            paddle.framework.random._manual_program_seed(2020)

            ffn_fc1_act = "relu"
            # 1.generate basic params
            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
                mode="encoder_layer")
            # 2.generate input for encoder
            src = np.random.rand(batch_size, sequence_length,
                                 d_model).astype("float32")
            residual = src
            src_mask = np.zeros((batch_size, n_head, sequence_length,
                                 sequence_length)).astype("float32")
            src_mask[0][0][0][0] = -np.inf

            # 3.paddle
            encoder_layer = TransformerEncoderLayer(
                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
                attn_dropout, act_dropout)

            encoder_output = encoder_layer(
                paddle.to_variable(src), paddle.to_variable(src_mask))
            # 4.reference:
            # paddle self attention
            self_attn = MultiHeadAttention(
                d_model, n_head, dropout=attn_dropout)
            attn_output = self_attn(
                paddle.to_variable(src),
                paddle.to_variable(src),
                paddle.to_variable(src), paddle.to_variable(src_mask)).numpy()

            src = attn_output + residual
            src_norm = layer_norm(src, d_model, encoder_layer.norm1)
            residual = src_norm

            ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
            src = residual + ffn_output
            src = layer_norm(src, d_model, encoder_layer.norm2)

            np.testing.assert_allclose(
                encoder_output.numpy(), src, rtol=1e-5, atol=1e-6)

    def test_transformer_decoder_layer(self):
        """TransformerDecoderLayer must match the reference, with and without cache."""
        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.framework.manual_seed(2020)
            activation = "relu"
            normalize_before = False
            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params(
                mode="decoder_layer")
            tgt = np.random.rand(batch_size, target_length,
                                 d_model).astype("float32")
            memory = np.random.rand(batch_size, source_length,
                                    d_model).astype("float32")
            tgt_mask = np.zeros((batch_size, n_head, target_length,
                                 target_length)).astype("float32")
            tgt_mask[0][0][0][0] = -1e9
            memory_mask = np.zeros((batch_size, n_head, target_length,
                                    source_length)).astype("float32")
            memory_mask[0][0][0][0] = -1e9
            for cache in [True, False]:
                self_attn = MultiHeadAttention(
                    d_model, n_head, dropout=attn_dropout)
                cross_attn = MultiHeadAttention(
                    d_model, n_head, dropout=attn_dropout)

                # paddle decoderlayer:
                decoder_layer = TransformerDecoderLayer(
                    d_model, n_head, dim_feedforward, dropout, activation,
                    attn_dropout, act_dropout, normalize_before)
                cache_objs = None
                if cache:
                    cache_objs = decoder_layer.gen_cache(
                        paddle.to_variable(memory))

                decoder_output = decoder_layer(
                    paddle.to_variable(tgt),
                    paddle.to_variable(memory),
                    paddle.to_variable(tgt_mask),
                    paddle.to_variable(memory_mask), cache_objs)

                decoder_output = decoder_output[0].numpy(
                ) if cache else decoder_output.numpy()

                # Reference path. It works on its own names (`ref*`) so the
                # input `tgt` stays untouched and both loop iterations are
                # fed the same data.
                residual = tgt
                # self-attn
                self_attn_cache = cache_objs[
                    0] if cache_objs is not None else None
                ref = self_attn(
                    paddle.to_variable(tgt),
                    paddle.to_variable(tgt),
                    paddle.to_variable(tgt),
                    paddle.to_variable(tgt_mask), self_attn_cache)

                ref = ref[0].numpy() if cache else ref.numpy()

                ref = residual + ref
                # postprocess
                ref_norm = layer_norm(ref, d_model, decoder_layer.norm1)
                residual = ref_norm
                # cross-attn
                cross_attn_cache = cache_objs[
                    1] if cache_objs is not None else None
                ref = cross_attn(
                    paddle.to_variable(ref_norm),
                    paddle.to_variable(memory),
                    paddle.to_variable(memory),
                    paddle.to_variable(memory_mask), cross_attn_cache)
                ref = ref[0].numpy() if cache else ref.numpy()

                # postprocess
                ref = ref + residual
                ref_norm = layer_norm(ref, d_model, decoder_layer.norm2)
                residual = ref_norm
                # FFN
                ffn_output = ffn(ref_norm, decoder_layer, activation)
                # post process
                ref = residual + ffn_output
                ref_norm = layer_norm(ref, d_model, decoder_layer.norm3)

                np.testing.assert_allclose(
                    decoder_output, ref_norm, rtol=1e-5, atol=1e-6)

    def test_encoder(self):
        """A 6-layer TransformerEncoder runs and keeps the input shape."""
        batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
            mode="encoder_layer")

        src = np.random.rand(batch_size, sequence_length,
                             d_model).astype("float32")

        src_mask = np.zeros((batch_size, n_head, sequence_length,
                             sequence_length)).astype("float32")
        src_mask[0][0][0][0] = -np.inf
        with fluid.dygraph.guard(fluid.CPUPlace()):
            encoder_layer = TransformerEncoderLayer(d_model, n_head,
                                                    dim_feedforward, dropout)
            num_layers = 6
            encoder = TransformerEncoder(encoder_layer, num_layers)
            # src, src_mask
            enc_output = encoder(
                paddle.to_variable(src), paddle.to_variable(src_mask))
            self.assertEqual(enc_output.numpy().shape, src.shape)

    def test_decoder(self):
        """A 6-layer TransformerDecoder runs and keeps the target shape."""
        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
            mode="decoder_layer")
        tgt = np.random.rand(batch_size, target_length,
                             d_model).astype("float32")
        memory = np.random.rand(batch_size, source_length,
                                d_model).astype("float32")
        tgt_mask = np.zeros((batch_size, n_head, target_length,
                             target_length)).astype("float32")
        tgt_mask[0][0][0][0] = -1e9
        memory_mask = np.zeros((batch_size, n_head, target_length,
                                source_length)).astype("float32")
        memory_mask[0][0][0][0] = -1e9
        with fluid.dygraph.guard(fluid.CPUPlace()):
            decoder_layer = TransformerDecoderLayer(d_model, n_head,
                                                    dim_feedforward, dropout)
            num_layers = 6
            decoder = TransformerDecoder(decoder_layer, num_layers)

            output = decoder(
                paddle.to_variable(tgt),
                paddle.to_variable(memory),
                paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask))
            self.assertEqual(output.numpy().shape, tgt.shape)

    def test_transformer(self):
        """A full Transformer runs and returns one vector per target position."""
        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
            mode="decoder_layer")

        with fluid.dygraph.guard(fluid.CPUPlace()):
            transformer = Transformer(
                d_model,
                n_head,
                dim_feedforward=dim_feedforward,
                dropout=dropout)
            src = paddle.to_variable(
                np.random.rand(batch_size, source_length, d_model).astype(
                    "float32"))
            tgt = paddle.to_variable(
                np.random.rand(batch_size, target_length, d_model).astype(
                    "float32"))
            src_mask = np.zeros((batch_size, n_head, source_length,
                                 source_length)).astype("float32")
            src_mask[0][0][0][0] = -np.inf
            src_mask = paddle.to_variable(src_mask)
            tgt_mask = np.zeros((batch_size, n_head, target_length,
                                 target_length)).astype("float32")
            tgt_mask[0][0][0][0] = -1e9
            memory_mask = np.zeros((batch_size, n_head, target_length,
                                    source_length)).astype("float32")
            memory_mask[0][0][0][0] = -1e9
            tgt_mask, memory_mask = paddle.to_variable(
                tgt_mask), paddle.to_variable(memory_mask)
            trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                       memory_mask)
            self.assertEqual(trans_output.numpy().shape,
                             (batch_size, target_length, d_model))


# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()