#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
import six
import logging
import paddle.fluid as fluid
import paddle.fluid.layers as L

from io import open

from models.ernie_model.transformer_encoder import encoder, pre_process_layer
from models.ernie_model.transformer_encoder import graph_encoder

log = logging.getLogger(__name__)


class ErnieConfig(object):
    def __init__(self, config_path):
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        try:
            with open(config_path, 'r', encoding='utf8') as json_file:
                config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing Ernie model config file '%s'" %
                          config_path)
        else:
            return config_dict

    def __getitem__(self, key):
        return self._config_dict.get(key, None)

    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            log.info('%s: %s' % (arg, value))
        log.info('------------------------------------------------')


class ErnieModel(object):
    def __init__(self,
                 src_ids,
                 sentence_ids,
                 position_ids=None,
                 input_mask=None,
                 task_ids=None,
                 config=None,
                 weight_sharing=True,
                 use_fp16=False,
                 name=""):

        self._set_config(config, name, weight_sharing)
        if position_ids is None:
            position_ids = self._build_position_ids(src_ids)
        if input_mask is None:
            input_mask = self._build_input_mask(src_ids)
        self._build_model(src_ids, position_ids, sentence_ids, task_ids,
                          input_mask)
        self._debug_summary(input_mask)

    def _debug_summary(self, input_mask):
        #histogram
        seqlen_before_pad = L.cast(
            L.reduce_sum(
                input_mask, dim=1), dtype='float32')
        seqlen_after_pad = L.reduce_sum(
            L.cast(
                L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1)
        pad_num = seqlen_after_pad - seqlen_before_pad
        pad_rate = pad_num / seqlen_after_pad

    def _build_position_ids(self, src_ids):
        d_shape = L.shape(src_ids)
        d_seqlen = d_shape[1]
        d_batch = d_shape[0]
        position_ids = L.reshape(
            L.range(
                0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1],
            inplace=True)
        position_ids = L.expand(position_ids, [d_batch, 1, 1])
        position_ids = L.cast(position_ids, 'int64')
        position_ids.stop_gradient = True
        return position_ids

    def _build_input_mask(self, src_ids):
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.logical_not(L.equal(src_ids,
                                           zero))  # assume pad id == 0
        input_mask = L.cast(input_mask, 'float')
        input_mask.stop_gradient = True
        return input_mask

    def _set_config(self, config, name, weight_sharing):
        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        if config.get('sent_type_vocab_size'):
            self._sent_types = config['sent_type_vocab_size']
        else:
            self._sent_types = config['type_vocab_size']

        self._use_task_id = config['use_task_id']
        if self._use_task_id:
            self._task_types = config['task_type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._postprocess_cmd = config.get('postprocess_cmd', 'dan')
        self._preprocess_cmd = config.get('preprocess_cmd', '')
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
        self._weight_sharing = weight_sharing
        self.name = name

        self._word_emb_name = self.name + "word_embedding"
        self._pos_emb_name = self.name + "pos_embedding"
        self._sent_emb_name = self.name + "sent_embedding"
        self._task_emb_name = self.name + "task_embedding"
        self._dtype = "float16" if config['use_fp16'] else "float32"
        self._emb_dtype = "float32"

        # Initialize all weigths by truncated normal initializer, and all biases
        # will be initialized by constant zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])

    def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                     input_mask):

        emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
                                        task_ids)
        self.input_mask = input_mask
        self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder(
            enc_input=emb_out,
            input_mask=input_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd=self._preprocess_cmd,
            postprocess_cmd=self._postprocess_cmd,
            param_initializer=self._param_initializer,
            name=self.name + 'encoder')
        if self._dtype == "float16":
            self._enc_out = fluid.layers.cast(
                x=self._enc_out, dtype=self._emb_dtype)

    def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids):
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)

        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))

        self.all_emb = [emb_out, position_emb_out, sent_emb_out]
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        if self._use_task_id:
            task_emb_out = fluid.layers.embedding(
                task_ids,
                size=[self._task_types, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=self._task_emb_name,
                    initializer=self._param_initializer))

            emb_out = emb_out + task_emb_out

        emb_out = pre_process_layer(
            emb_out,
            'nd',
            self._prepostprocess_dropout,
            name=self.name + 'pre_encoder')

        if self._dtype == "float16":
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        return emb_out

    def get_sequence_output(self):
        return self._enc_out

    def get_pooled_output(self):
        """Get the first feature of each sequence for classification"""
        next_sent_feat = self._enc_out[:, 0, :]
        #next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name=self.name + "pooled_fc.w_0",
                initializer=self._param_initializer),
            bias_attr=self.name + "pooled_fc.b_0")
        return next_sent_feat

    def get_lm_output(self, mask_label, mask_pos):
        """Get the loss & accuracy for pretraining"""

        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        # extract the first token feature in each sentence
        self.next_sent_feat = self.get_pooled_output()
        reshaped_emb_out = fluid.layers.reshape(
            x=self._enc_out, shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
            input=mask_feat,
            size=self._emb_size,
            act=self._hidden_act,
            param_attr=fluid.ParamAttr(
                name=self.name + 'mask_lm_trans_fc.w_0',
                initializer=self._param_initializer),
            bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0'))

        # transform: layer norm 
        mask_trans_feat = fluid.layers.layer_norm(
            mask_trans_feat,
            begin_norm_axis=len(mask_trans_feat.shape) - 1,
            param_attr=fluid.ParamAttr(
                name=self.name + 'mask_lm_trans_layer_norm_scale',
                initializer=fluid.initializer.Constant(1.)),
            bias_attr=fluid.ParamAttr(
                name=self.name + 'mask_lm_trans_layer_norm_bias',
                initializer=fluid.initializer.Constant(0.)))
        # transform: layer norm 
        #mask_trans_feat = pre_process_layer(
        #    mask_trans_feat, 'n', name=self.name + 'mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(
            name=self.name + "mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))
        if self._weight_sharing:
            fc_out = fluid.layers.matmul(
                x=mask_trans_feat,
                y=fluid.default_main_program().global_block().var(
                    self._word_emb_name),
                transpose_y=True)
            fc_out += fluid.layers.create_parameter(
                shape=[self._voc_size],
                dtype=self._emb_dtype,
                attr=mask_lm_out_bias_attr,
                is_bias=True)

        else:
            fc_out = fluid.layers.fc(input=mask_trans_feat,
                                     size=self._voc_size,
                                     param_attr=fluid.ParamAttr(
                                         name=self.name + "mask_lm_out_fc.w_0",
                                         initializer=self._param_initializer),
                                     bias_attr=mask_lm_out_bias_attr)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=mask_label)
        return mask_lm_loss

    def get_task_output(self, task, task_labels):
        task_fc_out = fluid.layers.fc(
            input=self.next_sent_feat,
            size=task["num_labels"],
            param_attr=fluid.ParamAttr(
                name=self.name + task["task_name"] + "_fc.w_0",
                initializer=self._param_initializer),
            bias_attr=self.name + task["task_name"] + "_fc.b_0")
        task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
            logits=task_fc_out, label=task_labels, return_softmax=True)
        task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
        return task_loss, task_acc


class ErnieGraphModel(ErnieModel):
    def __init__(self,
                 src_ids,
                 task_ids=None,
                 config=None,
                 weight_sharing=True,
                 use_fp16=False,
                 slot_seqlen=40,
                 name=""):
        self.slot_seqlen = slot_seqlen
        self._set_config(config, name, weight_sharing)
        input_mask = self._build_input_mask(src_ids)
        position_ids = self._build_position_ids(src_ids)
        sentence_ids = self._build_sentence_ids(src_ids)
        self._build_model(src_ids, position_ids, sentence_ids, task_ids,
                          input_mask)
        self._debug_summary(input_mask)

    def _build_position_ids(self, src_ids):
        src_shape = L.shape(src_ids)
        src_seqlen = src_shape[1]
        src_batch = src_shape[0]

        slot_seqlen = self.slot_seqlen

        num_b = (src_seqlen / slot_seqlen) - 1
        a_position_ids = L.reshape(
            L.range(
                0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
            inplace=True) # [1, slot_seqlen, 1]
        a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen * num_b, 1]

        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32")  # assume pad id == 0 [B, slot_seqlen, 1]
        a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1]

        b_position_ids = L.reshape(
            L.range(
                slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
            inplace=True) # [1, slot_seqlen, 1]
        b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1]) # [B, slot_seqlen * num_b, 1]
        b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1]

        position_ids = L.concat([a_position_ids, b_position_ids], 1)
        position_ids = L.cast(position_ids, 'int64')
        position_ids.stop_gradient = True
        return position_ids

    def _build_sentence_ids(self, src_ids):
        src_shape = L.shape(src_ids)
        src_seqlen = src_shape[1]
        src_batch = src_shape[0]

        slot_seqlen = self.slot_seqlen

        zeros = L.zeros([src_batch, slot_seqlen, 1], "int64")
        ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64")

        sentence_ids = L.concat([zeros, ones], 1)
        sentence_ids.stop_gradient = True
        return sentence_ids

    def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                     input_mask):

        emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
                                        task_ids)
        self.input_mask = input_mask
        self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder(
            enc_input=emb_out,
            input_mask=input_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd=self._preprocess_cmd,
            postprocess_cmd=self._postprocess_cmd,
            param_initializer=self._param_initializer,
            slot_seqlen=self.slot_seqlen,
            name=self.name + 'encoder')
        if self._dtype == "float16":
            self._enc_out = fluid.layers.cast(
                x=self._enc_out, dtype=self._emb_dtype)