# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
 
# recurrent_units.py
# Version 2.0
#
# Some recurrent units can be used in recurrent layer group, 
#   to use these units, import this module in your config_file:
#     import trainer.recurrent_units 
# 

from paddle.trainer.config_parser import *

# long short term memory, can be used in recurrent machine
# *inputs* must be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines parameter names, if the *para_prefix* of 
#   two LstmRecurrentUnit is same, they share same parameters
# *out_memory* can be defined outside if it's used outside
def LstmRecurrentUnit(name, size, 
                      active_type, state_active_type, gate_active_type, 
                      inputs, para_prefix = None, 
                      error_clipping_threshold = 0,
                      out_memory = None):

    if para_prefix is None: 
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)
 
    Layer(
          name = name + "_" + "input_recurrent",
          type = "mixed",
          size = size * 4, #(input_s, input_gate, forget_gate, output_gate)
          error_clipping_threshold = error_clipping_threshold,
          bias = Bias(initial_std = 0, 
                      parameter_name = para_prefix + "_input_recurrent.b"),
          inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
            ],
    )
    LstmStepLayer(
          name = name,
          size = size,
          bias = Bias(parameter_name = para_prefix + "_check.b"),
          inputs = [name + "_" + "input_recurrent", state_memory],
          active_type = active_type,
          active_gate_type = gate_active_type,
          active_state_type = state_active_type,
    )
    GetOutputLayer(
          name = name + "_" + "state",
          size = size,
          inputs = Input(name, input_layer_argument = "state"),
    )

def LstmRecurrentUnitNaive(name, size, 
                           active_type, state_active_type, gate_active_type, 
                           inputs, para_prefix = None, 
                           error_clipping_threshold = 0,
                           out_memory = None):

    if para_prefix is None: 
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)
 
    Layer(
          name = name + "_" + "input_recurrent",
          type = "mixed",
          size = size * 4, #(input_s, input_gate, forget_gate, output_gate)
          error_clipping_threshold = error_clipping_threshold,
          bias = Bias(initial_std = 0, 
                      parameter_name = para_prefix + "_input_recurrent.b"),
          inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
            ],
    )
    ExpressionLayer(
          name = name + "_" + "input_s",
          size = size,
          active_type = active_type,
          inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=0)],
    )
    ExpressionLayer(
          name = name + "_" + "input_gate",
          active_type = gate_active_type,
          inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size),
                    DotMulProjection(state_memory,
                                     parameter_name = para_prefix + "_input_check.w")],
    )
    ExpressionLayer(
          name = name + "_" + "forget_gate",
          active_type = gate_active_type,
          inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*2),
                    DotMulProjection(state_memory,
                                     parameter_name = para_prefix + "_forget_check.w")],
    )
    ExpressionLayer(
          name = name + "_" + "state",
          inputs = [DotMulOperator([name + "_" + "input_s",
                                    name + "_" + "input_gate"]),
                    DotMulOperator([state_memory, 
                                    name + "_" + "forget_gate"]),
                    ],
    )
    ExpressionLayer(
          name = name + "_" + "output_gate",
          active_type = gate_active_type,
          inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*3),
                    DotMulProjection(name + "_" + "state",
                                     parameter_name = para_prefix + "_output_check.w")],
    )
    ExpressionLayer(
          name = name + "_" + "state_atv",
          active_type = state_active_type,
          inputs = IdentityProjection(name + "_" + "state"),
    )
    ExpressionLayer(
          name = name,
          inputs = DotMulOperator([name + "_" + "state_atv",
                                   name + "_" + "output_gate"]),
    )

# like LstmRecurrentUnit, but it's a layer group.
# it is equivalent to LstmLayer
def LstmRecurrentLayerGroup(name, size, 
                            active_type, state_active_type, gate_active_type, 
                            inputs, para_prefix = None,
                            error_clipping_threshold = 0,
                            seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
          name = input_layer_name,
          type = "mixed",
          size = size * 4,
          active_type = "",
          bias = False,
          inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group", 
                             in_links = [input_layer_name], 
                             out_links = [name],
                             seq_reversed = seq_reversed)

    LstmRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        state_active_type = state_active_type,
        gate_active_type = gate_active_type,
        inputs = [IdentityProjection(input_layer_name)],
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
        )

    RecurrentLayerGroupEnd(name + "_layer_group")


# gated recurrent unit, can be used in recurrent machine
# *inputs* should be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines parameter names, if the *para_prefix* of 
#   two GatedRecurrentUnit is same, they share same parameters
# *out_memory* can be defined outside if it's used outside

def GatedRecurrentUnit(name, size, 
                       active_type, gate_active_type, 
                       inputs, para_prefix = None, 
                       error_clipping_threshold = 0,
                       out_memory = None):
    if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None: 
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    GruStepLayer(
          name = name,
          size = size,
          bias = Bias(parameter_name = para_prefix + "_gate.b"),
          inputs = [input_layer_name,
                    Input(out_memory, parameter_name = para_prefix + "_gate.w")],
          active_type = active_type,
          active_gate_type = gate_active_type,
    )

def GatedRecurrentUnitNaive(name, size, 
                            active_type, gate_active_type, 
                            inputs, para_prefix = None, 
                            error_clipping_threshold = 0,
                            out_memory = None):

    if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None: 
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    Layer(
          name = name + "_" + "update_gate",
          type = "mixed",
          size = size, 
          active_type = gate_active_type,
          error_clipping_threshold = error_clipping_threshold,
          bias = Bias(initial_std = 0, parameter_name = para_prefix + "_update_gate.b"),
          inputs = [IdentityOffsetProjection(input_layer_name, offset=0),
                    FullMatrixProjection(out_memory,
                                         parameter_name = para_prefix + "_update_gate.w")],
    )
    Layer(
          name = name + "_" + "reset_gate",
          type = "mixed",
          size = size, 
          active_type = gate_active_type,
          error_clipping_threshold = error_clipping_threshold,
          bias = Bias(initial_std = 0, parameter_name = para_prefix + "_reset_gate.b"),
          inputs = [IdentityOffsetProjection(input_layer_name, offset=size),
                    FullMatrixProjection(out_memory,
                                         parameter_name = para_prefix + "_reset_gate.w")],
    )
    ExpressionLayer(
          name = name + "_" + "reset_output",
          inputs = DotMulOperator([out_memory, name + "_" + "reset_gate"]),
    )
    Layer(
          name = name + "_" + "output_candidate",
          type = "mixed",
          size = size, 
          active_type = active_type,
          error_clipping_threshold = error_clipping_threshold,
          bias = Bias(initial_std = 0, parameter_name = para_prefix + "_output_candidate.b"),
          inputs = [IdentityOffsetProjection(input_layer_name, offset=size*2),
                    FullMatrixProjection(name + "_" + "reset_output",
                                         parameter_name = para_prefix + "_output_candidate.w")],
    )
    ExpressionLayer( #element-wise interpolation
          name = name,
          inputs = [IdentityProjection(out_memory),
                    DotMulOperator([out_memory, 
                                    name + "_" + "update_gate"], scale=-1.0),
                    DotMulOperator([name + "_" + "output_candidate", 
                                    name + "_" + "update_gate"]),
                    ],
    )

# like GatedRecurrentUnit, but it's a layer group.
# it is equivalent to GatedRecurrentLayer.
def GatedRecurrentLayerGroup(name, size, 
                             active_type, gate_active_type, 
                             inputs, para_prefix = None,
                             error_clipping_threshold = 0,
                             seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
          name = input_layer_name,
          type = "mixed",
          size = size * 3,
          active_type = "",
          bias = False,
          inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group", 
                             in_links = [input_layer_name], 
                             out_links = [name],
                             seq_reversed = seq_reversed)

    GatedRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        gate_active_type = gate_active_type,
        inputs = input_layer_name, #transform outside
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
        )

    RecurrentLayerGroupEnd(name + "_layer_group")