# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .layers import Layer
from paddle.fluid import layers
import copy

__all__ = ['LSTMCell', 'GRUCell']


class LSTMCell(Layer):
    """
    LSTMCell implementation using basic operators.
    There are two LSTMCell versions; the default one is compatible with the CUDNN
    LSTM implementation. The algorithm can be described by the equations below.

        .. math::

            i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)

            f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)

            o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)

            \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)

            c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}

            h_t &= o_t \\odot tanh(c_t)

    The other LSTMCell version is compatible with the BasicLSTMUnit used in
    static graph. The algorithm can be described by the equations below.

        .. math::

            i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)

            f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget\\_bias)

            o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)

            \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)

            c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}

            h_t &= o_t \\odot tanh(c_t)

    Args:
        hidden_size (integer): The hidden size used in the Cell.
        input_size (integer): The input size used in the Cell.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, lstm_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of LSTM unit.
            If it is set to None or one attribute of ParamAttr, lstm_unit will
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized as zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
            Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for cells (actNode).
            Default: 'fluid.layers.tanh'
        forget_bias(float|1.0): forget bias used when computing the forget gate.
            This is not used in the default (CUDNN compatible) LSTMCell implementation.
        use_cudnn_impl(bool|True): whether to use the CUDNN compatible LSTMCell
        dtype(string): data type used in this unit

    Returns:
        None

    Examples:
        .. code-block:: python

            from paddle import fluid
            import paddle.fluid.core as core
            from paddle.fluid.dygraph.rnn import LSTMCell
            import numpy as np

            batch_size = 64
            input_size = 128
            hidden_size = 256

            step_input_np = np.random.uniform(-0.1, 0.1, (
                batch_size, input_size)).astype('float64')
            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
                batch_size, hidden_size)).astype('float64')
            pre_cell_np = np.random.uniform(-0.1, 0.1, (
                batch_size, hidden_size)).astype('float64')

            if core.is_compiled_with_cuda():
                place = core.CUDAPlace(0)
            else:
                place = core.CPUPlace()

            with fluid.dygraph.guard(place):
                cudnn_lstm = LSTMCell(hidden_size, input_size)
                step_input_var = fluid.dygraph.to_variable(step_input_np)
                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
                pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
                new_hidden, new_cell = cudnn_lstm(step_input_var,
                                                  pre_hidden_var, pre_cell_var)

    """

    def __init__(self,
                 hidden_size,
                 input_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
                 forget_bias=1.0,
                 use_cudnn_impl=True,
                 dtype='float64'):
        super(LSTMCell, self).__init__(dtype)

        self._hidden_size = hidden_size
        self._input_size = input_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._dtype = dtype
        self._gate_activation = gate_activation or layers.sigmoid
        self._activation = activation or layers.tanh
        self._use_cudnn_impl = use_cudnn_impl

        if self._use_cudnn_impl:

            if self._param_attr is not None and self._param_attr.name is not None:
                weight_ih_param_attr = copy.deepcopy(self._param_attr)
                weight_hh_param_attr = copy.deepcopy(self._param_attr)
                weight_ih_param_attr.name += "_weight_ih"
                weight_hh_param_attr.name += "_weight_hh"
            else:
                weight_ih_param_attr = self._param_attr
                weight_hh_param_attr = self._param_attr

            if self._bias_attr is not None and self._bias_attr.name is not None:
                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
                bias_ih_param_attr.name += "_bias_ih"
                bias_hh_param_attr.name += "_bias_hh"
            else:
                bias_ih_param_attr = self._bias_attr
                bias_hh_param_attr = self._bias_attr

            self._weight_ih = self.create_parameter(
                attr=weight_ih_param_attr,
                shape=[4 * self._hidden_size, self._input_size],
                dtype=self._dtype)

            self._weight_hh = self.create_parameter(
                attr=weight_hh_param_attr,
                shape=[4 * self._hidden_size, self._hidden_size],
                dtype=self._dtype)

            self._bias_ih = self.create_parameter(
                attr=bias_ih_param_attr,
                shape=[4 * self._hidden_size],
                dtype=self._dtype,
                is_bias=True)
            self._bias_hh = self.create_parameter(
                attr=bias_hh_param_attr,
                shape=[4 * self._hidden_size],
                dtype=self._dtype,
                is_bias=True)

        else:

            self._forget_bias = layers.fill_constant(
                [1], dtype=dtype, value=forget_bias)
            self._forget_bias.stop_gradient = False

            self._weight = self.create_parameter(
                attr=self._param_attr,
                shape=[
                    self._input_size + self._hidden_size, 4 * self._hidden_size
                ],
                dtype=dtype)

            self._bias = self.create_parameter(
                attr=self._bias_attr,
                shape=[4 * self._hidden_size],
                dtype=dtype,
                is_bias=True)
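    # Note (added commentary, not from the original module): in the CUDNN
    # compatible branch above, _weight_ih has shape [4 * hidden_size, input_size]
    # and _weight_hh has shape [4 * hidden_size, hidden_size]; forward() below
    # splits the projected gates into four chunks in the order
    # (input gate, forget gate, cell candidate, output gate).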
    def forward(self, input, pre_hidden, pre_cell):

        if self._use_cudnn_impl:
            igates = layers.matmul(input, y=self._weight_ih, transpose_y=True)
            igates = layers.elementwise_add(igates, self._bias_ih)
            hgates = layers.matmul(
                pre_hidden, self._weight_hh, transpose_y=True)
            hgates = layers.elementwise_add(hgates, self._bias_hh)

            chunked_igates = layers.split(igates, num_or_sections=4, dim=1)
            chunked_hgates = layers.split(hgates, num_or_sections=4, dim=1)

            ingate = layers.elementwise_add(chunked_igates[0],
                                            chunked_hgates[0])
            ingate = self._gate_activation(ingate)

            forgetgate = layers.elementwise_add(chunked_igates[1],
                                                chunked_hgates[1])
            forgetgate = self._gate_activation(forgetgate)

            cellgate = layers.elementwise_add(chunked_igates[2],
                                              chunked_hgates[2])
            cellgate = self._activation(cellgate)

            outgate = layers.elementwise_add(chunked_igates[3],
                                             chunked_hgates[3])
            outgate = self._gate_activation(outgate)

            new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
            new_hidden = outgate * self._activation(new_cell)

        else:

            concat_input_hidden = layers.concat([input, pre_hidden], 1)
            gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

            gate_input = layers.elementwise_add(gate_input, self._bias)
            i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
            new_cell = layers.elementwise_add(
                layers.elementwise_mul(
                    pre_cell,
                    self._gate_activation(
                        layers.elementwise_add(f, self._forget_bias))),
                layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
            new_hidden = self._activation(new_cell) * self._gate_activation(o)

        return new_hidden, new_cell
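
# A minimal reference sketch (hypothetical helper, not part of the original
# module): one CUDNN-compatible LSTM step in plain NumPy, mirroring the gate
# order used by LSTMCell.forward above. It can be handy for unit-testing the
# dygraph cell against an independent computation.
def _np_lstm_step_reference(x, h, c, w_ih, w_hh, b_ih, b_hh):
    import numpy as np

    def _sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    # Project input and hidden state, then split into the four gate chunks
    # (input, forget, cell candidate, output).
    gates = x.dot(w_ih.T) + b_ih + h.dot(w_hh.T) + b_hh
    i, f, g, o = np.split(gates, 4, axis=1)
    i, f, o = _sigmoid(i), _sigmoid(f), _sigmoid(o)
    g = np.tanh(g)
    new_c = f * c + i * g
    new_h = o * np.tanh(new_c)
    return new_h, new_c
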
class GRUCell(Layer):
    """
    GRU implementation using basic operators.
    There are two GRUCell versions; the default one is compatible with the CUDNN
    GRU implementation. The algorithm can be described by the equations below.

        .. math::

            u_t & = sigmoid(W_{ux} x_{t} + b_{ux} + W_{uh} h_{t-1} + b_{uh})

            r_t & = sigmoid(W_{rx} x_{t} + b_{rx} + W_{rh} h_{t-1} + b_{rh})

            \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + b_{cx} + r_t \\odot (W_{ch} h_{t-1} + b_{ch}))

            h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}

    The other GRUCell version is compatible with the BasicGRUUnit used in
    static graph. The algorithm can be described by the equations below.

        .. math::

            u_t & = sigmoid(W_{ux} x_{t} + W_{uh} h_{t-1} + b_u)

            r_t & = sigmoid(W_{rx} x_{t} + W_{rh} h_{t-1} + b_r)

            \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + W_{ch} (r_t \\odot h_{t-1}) + b_m)

            h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}

    Args:
        hidden_size (integer): The hidden size used in the Cell.
        input_size (integer): The input size used in the Cell.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of GRU unit.
            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized as zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
            Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for the cell (actNode).
            Default: 'fluid.layers.tanh'
        use_cudnn_impl(bool|True): whether to use the CUDNN compatible GRUCell
        dtype(string): data type used in this unit

    Returns:
        None

    Examples:

        .. code-block:: python

            from paddle import fluid
            import paddle.fluid.core as core
            from paddle.fluid.dygraph.rnn import GRUCell
            import numpy as np

            batch_size = 64
            input_size = 128
            hidden_size = 256

            step_input_np = np.random.uniform(-0.1, 0.1, (
                batch_size, input_size)).astype('float64')
            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
                batch_size, hidden_size)).astype('float64')

            if core.is_compiled_with_cuda():
                place = core.CUDAPlace(0)
            else:
                place = core.CPUPlace()

            with fluid.dygraph.guard(place):
                cudnn_gru = GRUCell(hidden_size, input_size)
                step_input_var = fluid.dygraph.to_variable(step_input_np)
                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
                new_hidden = cudnn_gru(step_input_var, pre_hidden_var)

    """

    def __init__(self,
                 hidden_size,
                 input_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
                 use_cudnn_impl=True,
                 dtype='float64'):
        super(GRUCell, self).__init__(dtype)

        self._hidden_size = hidden_size
        self._input_size = input_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._dtype = dtype
        self._gate_activation = gate_activation or layers.sigmoid
        self._activation = activation or layers.tanh
        self._use_cudnn_impl = use_cudnn_impl

        if self._use_cudnn_impl:

            if self._param_attr is not None and self._param_attr.name is not None:
                weight_ih_param_attr = copy.deepcopy(self._param_attr)
                weight_hh_param_attr = copy.deepcopy(self._param_attr)
                weight_ih_param_attr.name += "_weight_ih"
                weight_hh_param_attr.name += "_weight_hh"
            else:
                weight_ih_param_attr = self._param_attr
                weight_hh_param_attr = self._param_attr

            if self._bias_attr is not None and self._bias_attr.name is not None:
                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
                bias_ih_param_attr.name += "_bias_ih"
                bias_hh_param_attr.name += "_bias_hh"
            else:
                bias_ih_param_attr = self._bias_attr
                bias_hh_param_attr = self._bias_attr

            self._weight_ih = self.create_parameter(
                attr=weight_ih_param_attr,
                shape=[3 * self._hidden_size, self._input_size],
                dtype=self._dtype)

            self._weight_hh = self.create_parameter(
                attr=weight_hh_param_attr,
                shape=[3 * self._hidden_size, self._hidden_size],
                dtype=self._dtype)

            self._bias_ih = self.create_parameter(
                attr=bias_ih_param_attr,
                shape=[3 * self._hidden_size],
                dtype=self._dtype,
                is_bias=True)
            self._bias_hh = self.create_parameter(
                attr=bias_hh_param_attr,
                shape=[3 * self._hidden_size],
                dtype=self._dtype,
                is_bias=True)

        else:

            if self._param_attr is not None and self._param_attr.name is not None:
                gate_weight_param_attr = copy.deepcopy(self._param_attr)
                candidate_weight_param_attr = copy.deepcopy(self._param_attr)
                gate_weight_param_attr.name += "_gate_weight"
                candidate_weight_param_attr.name += "_candidate_weight"
            else:
                gate_weight_param_attr = self._param_attr
                candidate_weight_param_attr = self._param_attr

            if self._bias_attr is not None and self._bias_attr.name is not None:
                gate_bias_param_attr = copy.deepcopy(self._bias_attr)
                candidate_bias_param_attr = copy.deepcopy(self._bias_attr)
                gate_bias_param_attr.name += "_gate_bias"
                candidate_bias_param_attr.name += "_candidate_bias"
            else:
                gate_bias_param_attr = self._bias_attr
                candidate_bias_param_attr = self._bias_attr

            self._gate_weight = self.create_parameter(
                attr=gate_weight_param_attr,
                shape=[
                    self._input_size + self._hidden_size, 2 * self._hidden_size
                ],
                dtype=dtype)

            self._candidate_weight = self.create_parameter(
                attr=candidate_weight_param_attr,
                shape=[
                    self._input_size + self._hidden_size, self._hidden_size
                ],
                dtype=dtype)

            self._gate_bias = self.create_parameter(
                attr=gate_bias_param_attr,
                shape=[2 * self._hidden_size],
                dtype=dtype,
                is_bias=True)
            self._candidate_bias = self.create_parameter(
                attr=candidate_bias_param_attr,
                shape=[self._hidden_size],
                dtype=dtype,
                is_bias=True)
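    # Note (added commentary, not from the original module): in the CUDNN
    # compatible branch above, the projected gates are split into three chunks
    # in the order (reset gate, update gate, candidate); in the BasicGRUUnit
    # compatible branch, _gate_weight maps the concatenated [input, pre_hidden]
    # to the reset and update gates, and _candidate_weight produces the
    # candidate hidden state.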
    def forward(self, input, pre_hidden):

        if self._use_cudnn_impl:

            igates = layers.matmul(input, y=self._weight_ih, transpose_y=True)
            igates = layers.elementwise_add(igates, self._bias_ih)
            hgates = layers.matmul(
                pre_hidden, self._weight_hh, transpose_y=True)
            hgates = layers.elementwise_add(hgates, self._bias_hh)

            chunked_igates = layers.split(igates, num_or_sections=3, dim=1)
            chunked_hgates = layers.split(hgates, num_or_sections=3, dim=1)

            reset_gate = layers.elementwise_add(chunked_igates[0],
                                                chunked_hgates[0])
            reset_gate = self._gate_activation(reset_gate)

            input_gate = layers.elementwise_add(chunked_igates[1],
                                                chunked_hgates[1])
            input_gate = self._gate_activation(input_gate)

            _temp = reset_gate * chunked_hgates[2]
            new_gate = layers.elementwise_add(chunked_igates[2], _temp)
            new_gate = self._activation(new_gate)

            new_hidden = (pre_hidden - new_gate) * input_gate + new_gate

        else:

            concat_input_hidden = layers.concat([input, pre_hidden], 1)

            gate_input = layers.matmul(
                x=concat_input_hidden, y=self._gate_weight)

            gate_input = layers.elementwise_add(gate_input, self._gate_bias)
            gate_input = self._gate_activation(gate_input)
            r, u = layers.split(gate_input, num_or_sections=2, dim=1)

            r_hidden = r * pre_hidden

            candidate = layers.matmul(
                layers.concat([input, r_hidden], 1), self._candidate_weight)
            candidate = layers.elementwise_add(candidate, self._candidate_bias)

            c = self._activation(candidate)
            new_hidden = u * pre_hidden + (1 - u) * c

        return new_hidden
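
# A minimal reference sketch (hypothetical helper, not part of the original
# module): one CUDNN-compatible GRU step in plain NumPy, mirroring
# GRUCell.forward above with its (reset, update, candidate) chunk order.
def _np_gru_step_reference(x, h, w_ih, w_hh, b_ih, b_hh):
    import numpy as np

    def _sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    # Project input and hidden state separately; the hidden-side candidate
    # chunk is gated by the reset gate before the tanh, as in the cell above.
    igates = x.dot(w_ih.T) + b_ih
    hgates = h.dot(w_hh.T) + b_hh
    i_r, i_u, i_c = np.split(igates, 3, axis=1)
    h_r, h_u, h_c = np.split(hgates, 3, axis=1)
    reset = _sigmoid(i_r + h_r)
    update = _sigmoid(i_u + h_u)
    candidate = np.tanh(i_c + reset * h_c)
    # Equivalent to update * h + (1 - update) * candidate.
    return (h - candidate) * update + candidate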