# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from test_imperative_base import new_program_scope

import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import BatchNorm, Linear


class Config:
    '''
    config for training
    '''

    # encoder rnn hidden_size
    encoder_size = 8
    # decoder hidden size for the decoding stage
    decoder_size = 8
    # size for word embedding
    word_vector_dim = 8
    # max length for label padding
    max_length = 3
    # optimizer setting
    LR = 1.0
    learning_rate_decay = None

    # batch size to train
    batch_size = 2
    # number of classes to classify
    num_classes = 64

    use_gpu = False
    # special labels for start and end
    SOS = 0
    EOS = 1
    # settings for ctc data, not used in this unittest
    DATA_DIR_NAME = "./dataset/ctc_data/data"
    TRAIN_DATA_DIR_NAME = "train_images"
    TRAIN_LIST_FILE_NAME = "train.list"

    # data shape for input image
    DATA_SHAPE = [1, 16, 64]


class ConvBNPool(paddle.nn.Layer):
    def __init__(
        self,
        group,
        out_ch,
        channels,
        act="relu",
        is_test=False,
        pool=True,
        use_cudnn=True,
    ):
        super().__init__()
        self.group = group
        self.pool = pool

        filter_size = 3
        conv_std_0 = (2.0 / (filter_size**2 * channels[0])) ** 0.5
        conv_param_0 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_0)
        )

        conv_std_1 = (2.0 / (filter_size**2 * channels[1])) ** 0.5
        conv_param_1 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_1)
        )

        self.conv_0_layer = paddle.nn.Conv2D(
            channels[0],
            out_ch[0],
            3,
            padding=1,
            weight_attr=conv_param_0,
            bias_attr=False,
        )
        self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test)
        self.conv_1_layer = paddle.nn.Conv2D(
            out_ch[0],
            out_ch[1],
            3,
            padding=1,
            weight_attr=conv_param_1,
            bias_attr=False,
        )
        self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test)

        if self.pool:
            self.pool_layer = paddle.nn.MaxPool2D(
                kernel_size=2,
                stride=2,
                ceil_mode=True,
            )

    def forward(self, inputs):
        conv_0 = self.conv_0_layer(inputs)
        bn_0 = self.bn_0_layer(conv_0)
        conv_1 = self.conv_1_layer(bn_0)
        bn_1 = self.bn_1_layer(conv_1)
        if self.pool:
            bn_pool = self.pool_layer(bn_1)
            return bn_pool
        return bn_1


class OCRConv(paddle.nn.Layer):
    def __init__(self, is_test=False, use_cudnn=True):
        super().__init__()
        self.conv_bn_pool_1 = ConvBNPool(
            2, [8, 8], [1, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_2 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_3 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_4 = ConvBNPool(
            2,
            [16, 16],
            [8, 16],
            is_test=is_test,
            pool=False,
            use_cudnn=use_cudnn,
        )

    def forward(self, inputs):
        inputs_1 = self.conv_bn_pool_1(inputs)
        inputs_2 = self.conv_bn_pool_2(inputs_1)
        inputs_3 = self.conv_bn_pool_3(inputs_2)
        inputs_4 = self.conv_bn_pool_4(inputs_3)

        return inputs_4


class DynamicGRU(paddle.nn.Layer):
    def __init__(
        self,
        size,
        param_attr=None,
        bias_attr=None,
        is_reverse=False,
        gate_activation='sigmoid',
        candidate_activation='tanh',
        h_0=None,
        origin_mode=False,
    ):
        super().__init__()

        self.gru_unit = paddle.nn.GRUCell(
            size * 3,
            size,
        )

        self.h_0 = h_0
        self.is_reverse = is_reverse
        self.size = size

    def forward(self, inputs):
        # Unroll the GRU manually over the time dimension (axis 1),
        # feeding one step at a time and collecting the hidden states.
        hidden = self.h_0
        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                i = inputs.shape[1] - 1 - i
            input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1])
            input_ = paddle.reshape(input_, [-1, input_.shape[2]])
            hidden, reset = self.gru_unit(input_, hidden)
            hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
            if self.is_reverse:
                res = [hidden_] + res
            else:
                res.append(hidden_)
        res = paddle.concat(res, axis=1)
        return res


class EncoderNet(paddle.nn.Layer):
    def __init__(
        self, rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True
    ):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        para_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02)
        )
        bias_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02),
            learning_rate=2.0,
        )
        if fluid.framework.in_dygraph_mode():
            h_0 = np.zeros(
                (Config.batch_size, rnn_hidden_size), dtype="float32"
            )
            h_0 = to_variable(h_0)
        else:
            h_0 = paddle.tensor.fill_constant(
                shape=[Config.batch_size, rnn_hidden_size],
                dtype='float32',
                value=0,
            )
        self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)

        self.fc_1_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.fc_2_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.gru_forward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
        )
        self.gru_backward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
            is_reverse=True,
        )

        self.encoded_proj_fc = Linear(
            rnn_hidden_size * 2, Config.decoder_size, bias_attr=False
        )

    def forward(self, inputs):
        conv_features = self.ocr_convs(inputs)
        # sliced_feature = fluid.layers.im2sequence(
        #     input=conv_features,
        #     stride=[1, 1],
        #     filter_size=[conv_features.shape[2], 1])

        # Treat the image width as the sequence dimension: move width right
        # after batch, then merge channels and height into the feature size.
        transpose_conv_features = paddle.transpose(
            conv_features, perm=[0, 3, 1, 2]
        )
        sliced_feature = paddle.reshape(
            transpose_conv_features,
            [
                -1,
                8,
                transpose_conv_features.shape[2]
                * transpose_conv_features.shape[3],
            ],
        )
        fc_1 = self.fc_1_layer(sliced_feature)
        fc_2 = self.fc_2_layer(sliced_feature)
        gru_forward = self.gru_forward_layer(fc_1)

        gru_backward = self.gru_backward_layer(fc_2)

        encoded_vector = paddle.concat([gru_forward, gru_backward], axis=2)

        encoded_proj = self.encoded_proj_fc(encoded_vector)

        return gru_backward, encoded_vector, encoded_proj


class SimpleAttention(paddle.nn.Layer):
    def __init__(self, decoder_size):
        super().__init__()

        self.fc_1 = Linear(decoder_size, decoder_size, bias_attr=False)
        self.fc_2 = Linear(decoder_size, 1, bias_attr=False)

    def forward(self, encoder_vec, encoder_proj, decoder_state):
        # Additive attention: project the decoder state, add it to the
        # projected encoder states, score with tanh + FC, then softmax.
        decoder_state_fc = self.fc_1(decoder_state)
        decoder_state_proj_reshape = paddle.reshape(
            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]]
        )
        decoder_state_expand = paddle.expand(
            decoder_state_proj_reshape,
            [-1, encoder_proj.shape[1], -1],
        )
        concated = paddle.add(encoder_proj, decoder_state_expand)
        concated = paddle.tanh(x=concated)
        attention_weight = self.fc_2(concated)

        weights_reshape = paddle.reshape(
            x=attention_weight,
            shape=[attention_weight.shape[0], attention_weight.shape[1]],
        )

        weights_reshape = paddle.nn.functional.softmax(weights_reshape)
        scaled = paddle.tensor.math._multiply_with_axis(
            x=encoder_vec, y=weights_reshape, axis=0
        )
        context = paddle.sum(scaled, axis=1)

        return context


class GRUDecoderWithAttention(paddle.nn.Layer):
    def __init__(self, decoder_size, num_classes):
        super().__init__()
        self.simple_attention = SimpleAttention(decoder_size)

        self.fc_1_layer = Linear(
            Config.encoder_size * 2, decoder_size * 3, bias_attr=False
        )
        self.fc_2_layer = Linear(
            decoder_size, decoder_size * 3, bias_attr=False
        )
        self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size)
        self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None)

        self.decoder_size = decoder_size

    def forward(
        self, target_embedding, encoder_vec, encoder_proj, decoder_boot
    ):
        res = []
        hidden_mem = decoder_boot
        for i in range(target_embedding.shape[1]):
            # For each target position: attend over the encoder states,
            # combine the context with the current word embedding, and
            # advance the GRU by one step.
            current_word = paddle.slice(
                target_embedding, axes=[1], starts=[i], ends=[i + 1]
            )
            current_word = paddle.reshape(
                current_word, [-1, current_word.shape[2]]
            )

            context = self.simple_attention(
                encoder_vec, encoder_proj, hidden_mem
            )
            fc_1 = self.fc_1_layer(context)
            fc_2 = self.fc_2_layer(current_word)
            decoder_inputs = paddle.add(x=fc_1, y=fc_2)

            h, _ = self.gru_unit(decoder_inputs, hidden_mem)
            hidden_mem = h
            out = self.out_layer(h)
            out = paddle.nn.functional.softmax(out)
            res.append(out)

        res1 = paddle.concat(res, axis=1)

        return res1


class OCRAttention(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.encoder_net = EncoderNet()
        self.fc = Linear(
            Config.encoder_size,
            Config.decoder_size,
            bias_attr=False,
        )
        self.embedding = paddle.nn.Embedding(
            Config.num_classes + 2, Config.word_vector_dim
        )
        self.gru_decoder_with_attention = GRUDecoderWithAttention(
            Config.decoder_size, Config.num_classes
        )

    def forward(self, inputs, label_in):
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
        backward_first = paddle.slice(
            gru_backward, axes=[1], starts=[0], ends=[1]
        )
        backward_first = paddle.reshape(
            backward_first, [-1, backward_first.shape[2]]
        )

        decoder_boot = self.fc(backward_first)
        decoder_boot = paddle.nn.functional.relu(decoder_boot)
        label_in = paddle.reshape(label_in, [-1])
        trg_embedding = self.embedding(label_in)

        trg_embedding = paddle.reshape(
            trg_embedding,
            [-1, Config.max_length, trg_embedding.shape[1]],
        )

        prediction = self.gru_decoder_with_attention(
            trg_embedding, encoded_vector, encoded_proj, decoder_boot
        )

        return prediction


class TestDygraphOCRAttention(unittest.TestCase):
    def test_ocr_test(self):
        seed = 90
        epoch_num = 1
        if core.is_compiled_with_cuda():
            batch_num = 3
        else:
            batch_num = 2
        # seed numpy's RNG so the generated inputs are reproducible
        np.random.seed(seed)
        image_np = np.random.randn(
            Config.batch_size,
            Config.DATA_SHAPE[0],
            Config.DATA_SHAPE[1],
            Config.DATA_SHAPE[2],
        ).astype('float32')

        label_in_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_in_np = np.vstack(
                (
                    label_in_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )
        label_out_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_out_np = np.vstack(
                (
                    label_out_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )

        def run_dygraph():
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = paddle.optimizer.lr.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR

            optimizer = paddle.optimizer.SGD(
                learning_rate=0.001, parameters=ocr_attention.parameters()
            )
            dy_param_init_value = {}
            for param in ocr_attention.parameters():
                dy_param_init_value[param.name] = param.numpy()
            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    label_in = to_variable(label_in_np)
                    label_out = to_variable(label_out_np)
                    label_out.stop_gradient = True
                    img = to_variable(image_np)
                    dy_prediction = ocr_attention(img, label_in)
                    label_out = paddle.reshape(label_out, [-1, 1])
                    dy_prediction = paddle.reshape(
                        dy_prediction, [label_out.shape[0], -1]
                    )
                    loss = paddle.nn.functional.cross_entropy(
                        input=dy_prediction,
                        label=label_out,
                        reduction='none',
                        use_softmax=False,
                    )
                    avg_loss = paddle.sum(loss)

                    dy_out = avg_loss.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in ocr_attention.parameters():
                            if param.name not in dy_param_init_value:
                                dy_param_init_value[param.name] = param.numpy()
                    avg_loss.backward()
                    dy_grad_value = {}
                    for param in ocr_attention.parameters():
                        if param.trainable:
                            np_array = np.array(
                                param._grad_ivar().value().get_tensor()
                            )
                            dy_grad_value[
                                param.name + core.grad_var_suffix()
                            ] = np_array

                    optimizer.minimize(avg_loss)
                    ocr_attention.clear_gradients()
                    dy_param_value = {}
                    for param in ocr_attention.parameters():
                        dy_param_value[param.name] = param.numpy()

            return dy_out, dy_param_init_value, dy_param_value

        with fluid.dygraph.guard():
            dy_out, dy_param_init_value, dy_param_value = run_dygraph()

        with fluid.dygraph.guard():
            (
                eager_out,
                eager_param_init_value,
                eager_param_value,
            ) = run_dygraph()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            exe = fluid.Executor(
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = paddle.optimizer.lr.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR
            optimizer = paddle.optimizer.SGD(learning_rate=0.001)

            images = paddle.static.data(
                name='pixel', shape=[-1] + Config.DATA_SHAPE, dtype='float32'
            )
            images.desc.set_need_check_feed(False)
            static_label_in = paddle.static.data(
                name='label_in', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_in.desc.set_need_check_feed(False)
            static_label_out = paddle.static.data(
                name='label_out', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_out.desc.set_need_check_feed(False)
            static_label_out.stop_gradient = True
            static_label_out.trainable = False

            static_prediction = ocr_attention(images, static_label_in)

            static_prediction = paddle.reshape(
                static_prediction, shape=[-1, Config.num_classes + 2]
            )

            cost = paddle.nn.functional.cross_entropy(
                input=static_prediction,
                label=static_label_out,
                reduction='none',
                use_softmax=False,
            )
            static_avg_loss = paddle.sum(cost)

            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
            optimizer.minimize(static_avg_loss)

            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in ocr_attention.parameters():
                static_param_name_list.append(param.name)
                if param.trainable:
                    static_grad_name_list.append(
                        param.name + core.grad_var_suffix()
                    )

            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            fetch_list = [static_avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)

            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    static_label_in = label_in_np
                    static_label_out = label_out_np
                    static_label_out = static_label_out.reshape((-1, 1))
                    out = exe.run(
                        fluid.default_main_program(),
                        feed={
                            "pixel": image_np,
                            "label_in": static_label_in,
                            "label_out": static_label_out,
                        },
                        fetch_list=fetch_list,
                    )
                    static_param_value = {}
                    static_grad_value = {}
                    static_out = out[0]
                    for i in range(1, len(static_param_name_list) + 1):
                        static_param_value[static_param_name_list[i - 1]] = out[
                            i
                        ]
                    grad_start_pos = len(static_param_name_list) + 1
                    for i in range(
                        grad_start_pos,
                        len(static_grad_name_list) + grad_start_pos,
                    ):
                        static_grad_value[
                            static_grad_name_list[i - grad_start_pos]
                        ] = out[i]

        np.testing.assert_allclose(static_out, dy_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, dy_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, dy_param_value[key], rtol=1e-05, atol=1e-8
            )

        # check eager here
        np.testing.assert_allclose(static_out, eager_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, eager_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, eager_param_value[key], rtol=1e-05, atol=1e-8
            )


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()