#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import json
import numpy as np
import paddle.fluid as fluid

from pdnlp.module.transformer_encoder import encoder as encoder
from pdnlp.module.transformer_encoder import pre_process_layer as pre_process_layer

class BertModel(object):
    """BERT network assembled from ``paddle.fluid`` layers.

    Constructing an instance immediately adds the three embedding lookups
    (word, position, sentence) and the transformer encoder to the current
    fluid program. The task heads are added lazily by
    :meth:`get_pooled_output` and :meth:`get_pretraining_output`.

    Args:
        src_ids: Variable of token ids, looked up in the word embedding.
        position_ids: Variable of position ids, looked up in the position
            embedding.
        sentence_ids: Variable of sentence (segment) ids, looked up in the
            sentence embedding.
        input_mask: Variable that is multiplied with its own transpose to
            build the self-attention mask (presumably 1 for real tokens and
            0 for padding -- confirm against the data reader).
        config: Object indexable by string key. The keys read here are
            ``hidden_size``, ``num_hidden_layers``, ``num_attention_heads``,
            ``vocab_size``, ``max_position_embeddings``, ``type_vocab_size``,
            ``hidden_act``, ``hidden_dropout_prob``,
            ``attention_probs_dropout_prob`` and ``initializer_range``.
        weight_sharing: If true, the masked-LM output projection reuses the
            word-embedding matrix (transposed) instead of a separate fc
            weight.
        use_fp16: If true, embeddings are created as ``float16`` and
            ``input_mask`` is cast to ``float16``; otherwise ``float32``.
        model_name: Prefix prepended to every parameter/layer name created
            by this class, so that several models can coexist in one
            program. With the default empty prefix the names are the plain
            BERT names.
    """

    def __init__(self,
                 src_ids,
                 position_ids,
                 sentence_ids,
                 input_mask,
                 config,
                 weight_sharing=True,
                 use_fp16=False,
                 model_name=''):

        self._emb_size = config["hidden_size"]
        self._n_layer = config["num_hidden_layers"]
        self._n_head = config["num_attention_heads"]
        self._voc_size = config["vocab_size"]
        self._max_position_seq_len = config["max_position_embeddings"]
        self._sent_types = config["type_vocab_size"]
        self._hidden_act = config["hidden_act"]
        self._prepostprocess_dropout = config["hidden_dropout_prob"]
        self._attention_dropout = config["attention_probs_dropout_prob"]
        self._weight_sharing = weight_sharing

        self.model_name = model_name

        self._word_emb_name = self.model_name + "word_embedding"
        self._pos_emb_name = self.model_name + "pos_embedding"
        self._sent_emb_name = self.model_name + "sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"

        # Initialize all weights by truncated normal initializer, and all
        # biases will be initialized by constant zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config["initializer_range"])

        self._build_model(src_ids, position_ids, sentence_ids, input_mask,
                          config)

    def _build_model(self, src_ids, position_ids, sentence_ids, input_mask,
                     config):
        """Add the embedding layers and the transformer encoder to the graph.

        Stores the three raw embedding outputs on ``self.emb_out``,
        ``self.position_emb_out`` and ``self.sent_emb_out`` and the encoder
        result on ``self._enc_out``. ``config`` is accepted for signature
        compatibility but is not read here.
        """
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)
        self.emb_out = emb_out

        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))
        self.position_emb_out = position_emb_out

        sent_emb_out = fluid.layers.embedding(
            input=sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))
        self.sent_emb_out = sent_emb_out

        # Sum of the three embeddings is the encoder input.
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        # The layer name carries the model prefix like every other named
        # layer in this class (presumably pre_process_layer derives its
        # layer-norm parameter names from it -- confirm in
        # pdnlp.module.transformer_encoder), so models built with different
        # prefixes do not share these parameters.
        emb_out = pre_process_layer(
            emb_out,
            'nd',
            self._prepostprocess_dropout,
            name=self.model_name + 'pre_encoder')

        if self._dtype == "float16":
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

        # Pairwise mask: product of the mask with its own transpose.
        self_attn_mask = fluid.layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)

        # With bias_after_scale=False this computes (mask - 1) * 10000:
        # entries equal to 1 become 0, entries equal to 0 become -10000.
        self_attn_mask = fluid.layers.scale(
            x=self_attn_mask,
            scale=10000.0,
            bias=-1.0,
            bias_after_scale=False)

        # Replicate the bias once per attention head along a new axis 1.
        n_head_self_attn_mask = fluid.layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name=self.model_name + 'encoder')

    def get_sequence_output(self):
        """Return the encoder output stored by ``_build_model``."""
        return self._enc_out

    def get_pooled_output(self):
        """Get the first feature of each sequence for classification.

        Slices index 0 along axis 1 of the encoder output and passes it
        through a tanh-activated fc layer of width ``hidden_size``.
        """
        next_sent_feat = fluid.layers.slice(
            input=self._enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name=self.model_name + "pooled_fc.w_0",
                initializer=self._param_initializer),
            # The bias is prefixed exactly like the weight above; an
            # unprefixed name would be shared by every model in the program
            # regardless of model_name.
            bias_attr=self.model_name + "pooled_fc.b_0")
        return next_sent_feat

    def get_pretraining_output(self, mask_label, mask_pos, labels):
        """Get the loss & accuracy for pretraining.

        Args:
            mask_label: Label variable for the masked-LM cross entropy.
            mask_pos: Indices of the masked tokens; cast to int32 and used
                to gather rows from the encoder output reshaped to
                ``[-1, hidden_size]``.
            labels: Label variable for the next-sentence cross entropy and
                accuracy.

        Returns:
            Tuple ``(next_sent_acc, mean_mask_lm_loss, loss)`` where ``loss``
            is the sum of the mean next-sentence loss and the mean masked-LM
            loss.
        """
        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        # extract the first token feature in each sentence
        next_sent_feat = self.get_pooled_output()
        reshaped_emb_out = fluid.layers.reshape(
            x=self._enc_out, shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(
            input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
            input=mask_feat,
            size=self._emb_size,
            act=self._hidden_act,
            param_attr=fluid.ParamAttr(
                name=self.model_name + 'mask_lm_trans_fc.w_0',
                initializer=self._param_initializer),
            bias_attr=fluid.ParamAttr(
                name=self.model_name + 'mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(
            mask_trans_feat, 'n', name=self.model_name + 'mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(
            name=self.model_name + "mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))
        if self._weight_sharing:
            # Reuse the word-embedding matrix (transposed) as the output
            # projection and add a separately created bias parameter.
            word_emb = fluid.default_main_program().global_block().var(
                self._word_emb_name)
            fc_out = fluid.layers.matmul(
                x=mask_trans_feat, y=word_emb, transpose_y=True)
            fc_out += fluid.layers.create_parameter(
                shape=[self._voc_size],
                dtype=self._dtype,
                attr=mask_lm_out_bias_attr,
                is_bias=True)
        else:
            fc_out = fluid.layers.fc(
                input=mask_trans_feat,
                size=self._voc_size,
                param_attr=fluid.ParamAttr(
                    name=self.model_name + "mask_lm_out_fc.w_0",
                    initializer=self._param_initializer),
                bias_attr=mask_lm_out_bias_attr)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=mask_label)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

        # Two-way classifier on the pooled feature for next-sentence
        # prediction.
        next_sent_fc_out = fluid.layers.fc(
            input=next_sent_feat,
            size=2,
            param_attr=fluid.ParamAttr(
                name=self.model_name + "next_sent_fc.w_0",
                initializer=self._param_initializer),
            bias_attr=self.model_name + "next_sent_fc.b_0")

        next_sent_loss, next_sent_softmax = (
            fluid.layers.softmax_with_cross_entropy(
                logits=next_sent_fc_out, label=labels, return_softmax=True))

        next_sent_acc = fluid.layers.accuracy(
            input=next_sent_softmax, label=labels)

        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

        loss = mean_next_sent_loss + mean_mask_lm_loss
        return next_sent_acc, mean_mask_lm_loss, loss


if __name__ == "__main__":
    # Running the file directly only prints a greeting; no model is built.
    print("hello world!")