recurrent_units.py

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# recurrent_units.py
# Version 2.0
#
# These recurrent units can be used inside a recurrent layer group.
#   To use them, import this module in your config file:
#     import trainer.recurrent_units
# 

from paddle.trainer.config_parser import *


# Long short-term memory (LSTM) unit; can be used in a recurrent machine.
# *inputs* must be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines the parameter names; if two LstmRecurrentUnits
#   have the same *para_prefix*, they share the same parameters.
# *out_memory* can be defined outside the unit if it is also used outside.
def LstmRecurrentUnit(name,
                      size,
                      active_type,
                      state_active_type,
                      gate_active_type,
                      inputs,
                      para_prefix=None,
                      error_clipping_threshold=0,
                      out_memory=None):
    """Declare the layers that make up one long short-term memory step.

    Three layers are declared, in this order:

    * ``<name>_input_recurrent`` -- a "mixed" layer of width ``size * 4``
      (input_s, input_gate, forget_gate, output_gate) fed by *inputs* plus
      a full-matrix projection of the output memory.
    * ``<name>`` -- an ``LstmStepLayer`` fed by the layer above and by the
      state memory.
    * ``<name>_state`` -- a ``GetOutputLayer`` whose input is
      ``Input(name, input_layer_argument="state")``.

    Args:
        name: name of the step layer and stem of the helper layer names.
        size: width of the unit.
        active_type: forwarded to the step layer as ``active_type``.
        state_active_type: forwarded as ``active_state_type``.
        gate_active_type: forwarded as ``active_gate_type``.
        inputs: list of projections; the recurrent projection is appended
            to it with ``+``, so it must support list concatenation.
        para_prefix: stem of the parameter names; falls back to *name*.
        error_clipping_threshold: set on ``<name>_input_recurrent`` only.
        out_memory: memory projected into the gate layer; when ``None`` a
            ``Memory(name=name, size=size)`` is created here.

    Returns:
        None. The function is used for the declarations it makes.
    """
    prefix = name if para_prefix is None else para_prefix

    prev_output = out_memory
    if prev_output is None:
        prev_output = Memory(name=name, size=size)

    state_name = name + "_state"
    gates_name = name + "_input_recurrent"

    # The output memory (above) is requested before the state memory,
    # exactly as the declarations were ordered originally.
    prev_state = Memory(name=state_name, size=size)

    gates_bias = Bias(
        initial_std=0, parameter_name=prefix + "_input_recurrent.b")
    recurrent_proj = FullMatrixProjection(
        prev_output, parameter_name=prefix + "_input_recurrent.w")
    Layer(
        name=gates_name,
        type="mixed",
        size=size * 4,  # (input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold=error_clipping_threshold,
        bias=gates_bias,
        inputs=inputs + [recurrent_proj])

    step_bias = Bias(parameter_name=prefix + "_check.b")
    LstmStepLayer(
        name=name,
        size=size,
        bias=step_bias,
        inputs=[gates_name, prev_state],
        active_type=active_type,
        active_gate_type=gate_active_type,
        active_state_type=state_active_type)

    # Publish the step layer's "state" argument under the name that the
    # state memory refers to.
    state_source = Input(name, input_layer_argument="state")
    GetOutputLayer(name=state_name, size=size, inputs=state_source)


def LstmRecurrentUnitNaive(name,
                           size,
                           active_type,
                           state_active_type,
                           gate_active_type,
                           inputs,
                           para_prefix=None,
                           error_clipping_threshold=0,
                           out_memory=None):
    """Declare one LSTM step out of individual mixed/expression layers.

    Takes the same arguments as ``LstmRecurrentUnit`` but, instead of a
    single ``LstmStepLayer``, declares these layers in this order:

    * ``<name>_input_recurrent`` -- "mixed" layer of width ``size * 4``
      (input_s, input_gate, forget_gate, output_gate).
    * ``<name>_input_s`` -- slice at offset 0, activation *active_type*.
    * ``<name>_input_gate`` -- slice at offset ``size`` plus a dot-mul
      projection of the state memory, activation *gate_active_type*.
    * ``<name>_forget_gate`` -- slice at offset ``size * 2`` plus a
      dot-mul projection of the state memory.
    * ``<name>_state`` -- built from ``input_s * input_gate`` and
      ``state_memory * forget_gate`` (presumably summed by the layer;
      confirm against ExpressionLayer semantics).
    * ``<name>_output_gate`` -- slice at offset ``size * 3`` plus a
      dot-mul projection of ``<name>_state``.
    * ``<name>_state_atv`` -- ``<name>_state`` with *state_active_type*.
    * ``<name>`` -- ``state_atv * output_gate``.

    Args:
        name: name of the output layer and stem of the helper layer names.
        size: width of the unit.
        active_type: activation of ``<name>_input_s``.
        state_active_type: activation of ``<name>_state_atv``.
        gate_active_type: activation of the three gate layers.
        inputs: list of projections; the recurrent projection is appended
            to it with ``+``.
        para_prefix: stem of the parameter names; falls back to *name*.
        error_clipping_threshold: set on ``<name>_input_recurrent`` only.
        out_memory: memory projected into the gate layer; when ``None`` a
            ``Memory(name=name, size=size)`` is created here.

    Returns:
        None.
    """
    prefix = name if para_prefix is None else para_prefix

    prev_output = out_memory
    if prev_output is None:
        prev_output = Memory(name=name, size=size)

    # Names of the layers declared below.
    gates_name = name + "_input_recurrent"
    input_s_name = name + "_input_s"
    input_gate_name = name + "_input_gate"
    forget_gate_name = name + "_forget_gate"
    state_name = name + "_state"
    output_gate_name = name + "_output_gate"
    state_atv_name = name + "_state_atv"

    prev_state = Memory(name=state_name, size=size)

    gates_bias = Bias(
        initial_std=0, parameter_name=prefix + "_input_recurrent.b")
    recurrent_proj = FullMatrixProjection(
        prev_output, parameter_name=prefix + "_input_recurrent.w")
    Layer(
        name=gates_name,
        type="mixed",
        size=size * 4,  # (input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold=error_clipping_threshold,
        bias=gates_bias,
        inputs=inputs + [recurrent_proj])

    # Candidate input: first slice of the combined layer.
    ExpressionLayer(
        name=input_s_name,
        size=size,
        active_type=active_type,
        inputs=[IdentityOffsetProjection(gates_name, offset=0)])

    # Input gate: second slice plus a dot-mul projection of the old state.
    input_gate_sources = [
        IdentityOffsetProjection(gates_name, offset=size),
        DotMulProjection(
            prev_state, parameter_name=prefix + "_input_check.w"),
    ]
    ExpressionLayer(
        name=input_gate_name,
        active_type=gate_active_type,
        inputs=input_gate_sources)

    # Forget gate: third slice plus a dot-mul projection of the old state.
    forget_gate_sources = [
        IdentityOffsetProjection(gates_name, offset=size * 2),
        DotMulProjection(
            prev_state, parameter_name=prefix + "_forget_check.w"),
    ]
    ExpressionLayer(
        name=forget_gate_name,
        active_type=gate_active_type,
        inputs=forget_gate_sources)

    # New state from the gated candidate and the gated old state.
    state_terms = [
        DotMulOperator([input_s_name, input_gate_name]),
        DotMulOperator([prev_state, forget_gate_name]),
    ]
    ExpressionLayer(name=state_name, inputs=state_terms)

    # Output gate: fourth slice plus a dot-mul projection of the NEW state.
    output_gate_sources = [
        IdentityOffsetProjection(gates_name, offset=size * 3),
        DotMulProjection(
            state_name, parameter_name=prefix + "_output_check.w"),
    ]
    ExpressionLayer(
        name=output_gate_name,
        active_type=gate_active_type,
        inputs=output_gate_sources)

    ExpressionLayer(
        name=state_atv_name,
        active_type=state_active_type,
        inputs=IdentityProjection(state_name))

    # Unit output: activated state multiplied by the output gate.
    ExpressionLayer(
        name=name,
        inputs=DotMulOperator([state_atv_name, output_gate_name]))


# Like LstmRecurrentUnit, but it is a layer group.
# It is equivalent to LstmLayer.
def LstmRecurrentLayerGroup(name,
                            size,
                            active_type,
                            state_active_type,
                            gate_active_type,
                            inputs,
                            para_prefix=None,
                            error_clipping_threshold=0,
                            seq_reversed=False):
    """Declare an ``LstmRecurrentUnit`` inside its own recurrent layer group.

    A bias-free "mixed" layer ``<name>_transform_input`` of width
    ``size * 4`` is declared from *inputs* before the group is opened.
    The group ``<name>_layer_group`` takes that layer as its only in-link
    and exposes ``<name>`` as its only out-link.

    Args:
        name: name of the LSTM unit and of the group's out-link.
        size: width of the unit.
        active_type: forwarded to ``LstmRecurrentUnit``.
        state_active_type: forwarded to ``LstmRecurrentUnit``.
        gate_active_type: forwarded to ``LstmRecurrentUnit``.
        inputs: inputs of the ``<name>_transform_input`` layer.
        para_prefix: forwarded to ``LstmRecurrentUnit``.
        error_clipping_threshold: forwarded to ``LstmRecurrentUnit``.
        seq_reversed: forwarded to ``RecurrentLayerGroupBegin``.

    Returns:
        None.
    """
    projected_name = name + "_transform_input"
    group_name = name + "_layer_group"

    # The input transform is declared before the group is opened.
    Layer(
        name=projected_name,
        type="mixed",
        size=size * 4,
        active_type="",
        bias=False,
        inputs=inputs)

    RecurrentLayerGroupBegin(
        group_name,
        in_links=[projected_name],
        out_links=[name],
        seq_reversed=seq_reversed)

    # Inside the group the transformed input is passed through unchanged.
    step_inputs = [IdentityProjection(projected_name)]
    LstmRecurrentUnit(
        name=name,
        size=size,
        active_type=active_type,
        state_active_type=state_active_type,
        gate_active_type=gate_active_type,
        inputs=step_inputs,
        para_prefix=para_prefix,
        error_clipping_threshold=error_clipping_threshold)

    RecurrentLayerGroupEnd(group_name)


# Gated recurrent unit (GRU); can be used in a recurrent machine.
# *inputs* should be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines the parameter names; if two GatedRecurrentUnits
#   have the same *para_prefix*, they share the same parameters.
# *out_memory* can be defined outside the unit if it is also used outside.


def GatedRecurrentUnit(name,
                       size,
                       active_type,
                       gate_active_type,
                       inputs,
                       para_prefix=None,
                       error_clipping_threshold=0,
                       out_memory=None):
    """Declare one gated recurrent unit step as a ``GruStepLayer``.

    When *inputs* is a ``str`` it is used directly as the name of an
    already transformed input layer (this form is only used by
    ``GatedRecurrentLayerGroup``). Otherwise a bias-free "mixed" layer
    ``<name>_transform_input`` of width ``size * 3`` is declared from
    *inputs* first.

    Args:
        name: name of the step layer.
        size: width of the unit.
        active_type: forwarded to the step layer as ``active_type``.
        gate_active_type: forwarded as ``active_gate_type``.
        inputs: list of projections, or the name of a transformed layer.
        para_prefix: stem of the parameter names; falls back to *name*.
        error_clipping_threshold: accepted for signature parity with the
            other units; this function does not read it.
        out_memory: memory fed to the step layer; when ``None`` a
            ``Memory(name=name, size=size)`` is created here.

    Returns:
        None.
    """
    if type_of(inputs) != str:
        gate_input_name = name + "_transform_input"
        Layer(
            name=gate_input_name,
            type="mixed",
            size=size * 3,
            active_type="",
            bias=False,
            inputs=inputs)
    else:
        # Only used by GatedRecurrentLayerGroup: the transform already
        # exists and *inputs* is its layer name.
        gate_input_name = inputs

    prefix = name if para_prefix is None else para_prefix

    prev_output = out_memory
    if prev_output is None:
        prev_output = Memory(name=name, size=size)

    step_bias = Bias(parameter_name=prefix + "_gate.b")
    recurrent_input = Input(prev_output, parameter_name=prefix + "_gate.w")
    GruStepLayer(
        name=name,
        size=size,
        bias=step_bias,
        inputs=[gate_input_name, recurrent_input],
        active_type=active_type,
        active_gate_type=gate_active_type)


def GatedRecurrentUnitNaive(name,
                            size,
                            active_type,
                            gate_active_type,
                            inputs,
                            para_prefix=None,
                            error_clipping_threshold=0,
                            out_memory=None):
    """Declare one GRU step out of individual mixed/expression layers.

    Takes the same arguments as ``GatedRecurrentUnit`` but, instead of a
    single ``GruStepLayer``, declares these layers in this order:

    * ``<name>_transform_input`` -- only when *inputs* is not a ``str``;
      a bias-free "mixed" layer of width ``size * 3``.
    * ``<name>_update_gate`` -- slice at offset 0 plus a full-matrix
      projection of the output memory, activation *gate_active_type*.
    * ``<name>_reset_gate`` -- slice at offset ``size`` plus a full-matrix
      projection of the output memory, activation *gate_active_type*.
    * ``<name>_reset_output`` -- ``out_memory * reset_gate``.
    * ``<name>_output_candidate`` -- slice at offset ``size * 2`` plus a
      full-matrix projection of ``reset_output``, activation *active_type*.
    * ``<name>`` -- element-wise interpolation built from ``out_memory``,
      ``-1.0 * out_memory * update_gate`` and
      ``output_candidate * update_gate``.

    Args:
        name: name of the output layer and stem of the helper layer names.
        size: width of the unit.
        active_type: activation of ``<name>_output_candidate``.
        gate_active_type: activation of the update and reset gates.
        inputs: list of projections, or the name of a transformed layer.
        para_prefix: stem of the parameter names; falls back to *name*.
        error_clipping_threshold: set on the update gate, reset gate and
            output candidate layers.
        out_memory: memory of the previous output; when ``None`` a
            ``Memory(name=name, size=size)`` is created here.

    Returns:
        None.
    """
    if type_of(inputs) != str:
        gate_input_name = name + "_transform_input"
        Layer(
            name=gate_input_name,
            type="mixed",
            size=size * 3,
            active_type="",
            bias=False,
            inputs=inputs)
    else:
        # Only used by GatedRecurrentLayerGroup: *inputs* is already the
        # name of the transformed layer.
        gate_input_name = inputs

    prefix = name if para_prefix is None else para_prefix

    prev_output = out_memory
    if prev_output is None:
        prev_output = Memory(name=name, size=size)

    # Names of the layers declared below.
    update_gate_name = name + "_update_gate"
    reset_gate_name = name + "_reset_gate"
    reset_output_name = name + "_reset_output"
    candidate_name = name + "_output_candidate"

    # Update gate: first slice plus a projection of the previous output.
    update_bias = Bias(
        initial_std=0, parameter_name=prefix + "_update_gate.b")
    update_sources = [
        IdentityOffsetProjection(gate_input_name, offset=0),
        FullMatrixProjection(
            prev_output, parameter_name=prefix + "_update_gate.w"),
    ]
    Layer(
        name=update_gate_name,
        type="mixed",
        size=size,
        active_type=gate_active_type,
        error_clipping_threshold=error_clipping_threshold,
        bias=update_bias,
        inputs=update_sources)

    # Reset gate: second slice plus a projection of the previous output.
    reset_bias = Bias(
        initial_std=0, parameter_name=prefix + "_reset_gate.b")
    reset_sources = [
        IdentityOffsetProjection(gate_input_name, offset=size),
        FullMatrixProjection(
            prev_output, parameter_name=prefix + "_reset_gate.w"),
    ]
    Layer(
        name=reset_gate_name,
        type="mixed",
        size=size,
        active_type=gate_active_type,
        error_clipping_threshold=error_clipping_threshold,
        bias=reset_bias,
        inputs=reset_sources)

    # Previous output scaled element-wise by the reset gate.
    ExpressionLayer(
        name=reset_output_name,
        inputs=DotMulOperator([prev_output, reset_gate_name]))

    # Candidate: third slice plus a projection of the reset output.
    candidate_bias = Bias(
        initial_std=0, parameter_name=prefix + "_output_candidate.b")
    candidate_sources = [
        IdentityOffsetProjection(gate_input_name, offset=size * 2),
        FullMatrixProjection(
            reset_output_name,
            parameter_name=prefix + "_output_candidate.w"),
    ]
    Layer(
        name=candidate_name,
        type="mixed",
        size=size,
        active_type=active_type,
        error_clipping_threshold=error_clipping_threshold,
        bias=candidate_bias,
        inputs=candidate_sources)

    # Element-wise interpolation between the previous output and the
    # candidate, weighted by the update gate.
    interpolation_terms = [
        IdentityProjection(prev_output),
        DotMulOperator([prev_output, update_gate_name], scale=-1.0),
        DotMulOperator([candidate_name, update_gate_name]),
    ]
    ExpressionLayer(name=name, inputs=interpolation_terms)


# Like GatedRecurrentUnit, but it is a layer group.
# It is equivalent to GatedRecurrentLayer.
def GatedRecurrentLayerGroup(name,
                             size,
                             active_type,
                             gate_active_type,
                             inputs,
                             para_prefix=None,
                             error_clipping_threshold=0,
                             seq_reversed=False):
    """Declare a ``GatedRecurrentUnit`` inside its own recurrent layer group.

    A bias-free "mixed" layer ``<name>_transform_input`` of width
    ``size * 3`` is declared from *inputs* before the group is opened.
    The group ``<name>_layer_group`` takes that layer as its only in-link
    and exposes ``<name>`` as its only out-link. The unit receives the
    layer's name (a string) as its ``inputs``.

    Args:
        name: name of the GRU unit and of the group's out-link.
        size: width of the unit.
        active_type: forwarded to ``GatedRecurrentUnit``.
        gate_active_type: forwarded to ``GatedRecurrentUnit``.
        inputs: inputs of the ``<name>_transform_input`` layer.
        para_prefix: forwarded to ``GatedRecurrentUnit``.
        error_clipping_threshold: forwarded to ``GatedRecurrentUnit``.
        seq_reversed: forwarded to ``RecurrentLayerGroupBegin``.

    Returns:
        None.
    """
    projected_name = name + "_transform_input"
    group_name = name + "_layer_group"

    # The input transform is declared before the group is opened.
    Layer(
        name=projected_name,
        type="mixed",
        size=size * 3,
        active_type="",
        bias=False,
        inputs=inputs)

    RecurrentLayerGroupBegin(
        group_name,
        in_links=[projected_name],
        out_links=[name],
        seq_reversed=seq_reversed)

    GatedRecurrentUnit(
        name=name,
        size=size,
        active_type=active_type,
        gate_active_type=gate_active_type,
        inputs=projected_name,  # transform was done outside the group
        para_prefix=para_prefix,
        error_clipping_threshold=error_clipping_threshold)

    RecurrentLayerGroupEnd(group_name)