# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.fluid.layers as layers
import paddle.fluid as fluid
import numpy as np


def dropout(input, test_mode, args):
    if args.dropout and (not test_mode):
        return layers.dropout(
            input,
            dropout_prob=args.dropout,
            dropout_implementation="upscale_in_train",
            seed=args.random_seed,
            is_test=False)
    else:
        return input


def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name, proj_size,
                  test_mode, args):
    # An LSTM encoder implementation with projection.
    # The linear transformations for the input gate, output gate, forget gate
    # and cell activation vectors need to be done outside of dynamic_lstmp,
    # so the fc output size is 4 times gate_size.
    input_seq = dropout(input_seq, test_mode, args)
    input_proj = layers.fc(input=input_seq,
                           param_attr=fluid.ParamAttr(
                               name=para_name + '_gate_w', initializer=None),
                           size=gate_size * 4,
                           act=None,
                           bias_attr=False)
    hidden, cell = layers.dynamic_lstmp(
        input=input_proj,
        size=gate_size * 4,
        proj_size=proj_size,
        h_0=h_0,
        c_0=c_0,
        use_peepholes=False,
        proj_clip=args.proj_clip,
        cell_clip=args.cell_clip,
        proj_activation="identity",
        param_attr=fluid.ParamAttr(initializer=None),
        bias_attr=fluid.ParamAttr(initializer=None))
    return hidden, cell, input_proj


def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden is not None and init_cell is not None:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)
    softmax_weight = layers.create_parameter(
        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
    softmax_bias = layers.create_parameter(
        [vocab_size], dtype="float32", name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection, shape=[-1, vocab_size])
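    # During training, a sampled softmax over args.n_negative_samples_batch
    # candidate classes avoids materializing the full-vocabulary softmax;
    # evaluation (or training without sampling) falls back to a full softmax
    # over one-hot labels.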
    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs


class LanguageModel(object):
    def __init__(self, args, vocab_size, test_mode):
        self.args = args
        self.vocab_size = vocab_size
        self.test_mode = test_mode

    def build(self):
        args = self.args
        emb_size = args.embed_size
        proj_size = args.embed_size
        hidden_size = args.hidden_size
        batch_size = args.batch_size
        num_layers = args.num_layers
        num_steps = args.num_steps
        lstm_outputs = []

        x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1)
        y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1)

        x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1)
        y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1)

        init_hiddens_ = layers.data(
            name="init_hiddens", shape=[1], dtype='float32')
        init_cells_ = layers.data(
            name="init_cells", shape=[1], dtype='float32')

        init_hiddens = layers.reshape(
            init_hiddens_, shape=[2 * num_layers, -1, proj_size])
        init_cells = layers.reshape(
            init_cells_, shape=[2 * num_layers, -1, hidden_size])

        init_hidden = layers.slice(
            init_hiddens, axes=[0], starts=[0], ends=[num_layers])
        init_cell = layers.slice(
            init_cells, axes=[0], starts=[0], ends=[num_layers])
        init_hidden_r = layers.slice(
            init_hiddens,
            axes=[0],
            starts=[num_layers],
            ends=[2 * num_layers])
        init_cell_r = layers.slice(
            init_cells, axes=[0], starts=[num_layers], ends=[2 * num_layers])

        if args.use_custom_samples:
            custom_samples = layers.data(
                name="custom_samples",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_samples_r = layers.data(
                name="custom_samples_r",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_probabilities = layers.data(
                name="custom_probabilities",
                shape=[args.n_negative_samples_batch + 1],
                dtype='float32',
                lod_level=1)
        else:
            custom_samples = None
            custom_samples_r = None
            custom_probabilities = None

        forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
            x_f,
            y_f,
            self.vocab_size,
            emb_size,
            init_hidden,
            init_cell,
            para_name='fw_',
            custom_samples=custom_samples,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)
        backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
            x_b,
            y_b,
            self.vocab_size,
            emb_size,
            init_hidden_r,
            init_cell_r,
            para_name='bw_',
            custom_samples=custom_samples_r,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)

        losses = layers.concat([forward[-1], backward[-1]])
        self.loss = layers.reduce_mean(losses)
        self.loss.persistable = True
        self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
        self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']
        fw_vars_name = ['x_emb', 'proj', 'loss'] + [
            'init_hidden', 'init_cell'
        ] + ['rnn_out', 'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2']
        bw_vars_name = ['x_emb_r', 'proj_r', 'loss_r'] + [
            'init_hidden_r', 'init_cell_r'
        ] + [
            'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r',
            'xproj2_r'
        ]
        fw_vars = forward + [init_hidden, init_cell
                             ] + fw_hiddens + fw_cells + fw_projs
        bw_vars = backward + [init_hidden_r, init_cell_r
                              ] + bw_hiddens + bw_cells + bw_projs
        for i in range(len(fw_vars_name)):
            self.grad_vars.append(fw_vars[i])
            self.grad_vars.append(bw_vars[i])
            self.grad_vars_name.append(fw_vars_name[i])
            self.grad_vars_name.append(bw_vars_name[i])
        if args.use_custom_samples:
            self.feed_order = [
                'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r',
                'custom_probabilities'
            ]
        else:
            self.feed_order = ['x', 'y', 'x_r', 'y_r']
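        # Keep the final time-step hidden and cell states of every layer in
        # both directions. They are marked persistable so the training loop
        # can fetch them after a batch and feed them back through the
        # "init_hiddens" / "init_cells" inputs defined above.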
        self.last_hidden = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_hiddens_ori + bw_hiddens_ori
        ]
        self.last_cell = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_cells + bw_cells
        ]
        self.last_hidden = layers.concat(self.last_hidden, axis=0)
        self.last_hidden.persistable = True
        self.last_cell = layers.concat(self.last_cell, axis=0)
        self.last_cell.persistable = True
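

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the training pipeline).
# It shows how the bidirectional LM graph might be built inside a fluid
# program. The hyperparameter values below are placeholders; the field names
# simply mirror the attributes read from `args` above, while the real values
# normally come from the project's argument parser.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from argparse import Namespace

    example_args = Namespace(
        embed_size=512,
        hidden_size=2048,
        batch_size=32,
        num_layers=2,
        num_steps=20,
        dropout=0.1,
        random_seed=0,
        proj_clip=3.0,
        cell_clip=3.0,
        sample_softmax=True,
        n_negative_samples_batch=8000,
        use_custom_samples=False)

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        model = LanguageModel(example_args, vocab_size=10000, test_mode=False)
        model.build()
    print('feed order:', model.feed_order)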