Merge pull request #2641 from lcy-seso/enable_boot_memory_for_lstm

enable users to set initial memory states for lstm/gru group.

Merge pull request #2641 from lcy-seso/enable_boot_memory_for_lstm
enable users to set initial memory states for lstm/gru group.
d011514e · Cao Ying · GitHub · c8e56d31 · 5c68aaca · d011514e
Showing with 58 additions and 43 deletions

python/paddle/trainer_config_helpers/layers.py python/paddle/trainer_config_helpers/layers.py +26 -25

python/paddle/trainer_config_helpers/networks.py python/paddle/trainer_config_helpers/networks.py +32 -18

未找到文件。
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1149,10 +1149,10 @@ def pooling_layer(input,
 @layer_support(DROPOUT)
 def lstmemory(input,
              name=None,
+              size=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              state_act=None,
              bias_attr=None,
              param_attr=None,
@@ -1194,6 +1194,8 @@ def lstmemory(input,
    :param name: The lstmemory layer name.
    :type name: basestring
+    :param size: DEPRECATED. size of the lstm cell
+    :type size: int
    :param input: input layer name.
    :type input: LayerOutput
    :param reverse: is sequence process reversed or not.
@@ -1220,15 +1222,15 @@ def lstmemory(input,
    assert state_act.support_hppl
    assert act.support_hppl
    assert input.size is not None and input.size % 4 == 0
    if size is not None:
        if input.size / 4 == size:
            plog = logger.warning
        else:
            plog = logger.fatal
+        plog("size of lstmemory layer: %s is automatically set to "
-        plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
+             "size of input layer / 4. The parameter size passing to "
-             "layer. The lstm size should be equal with input layer size/4. The"
+             "this layer is ignored." % (name))
-             " size which is set explicitly will be ignored." % name)
    Layer(
        name=name,
@@ -1255,11 +1257,11 @@ def lstmemory(input,
 @wrap_name_default("gru")
 @layer_support(DROPOUT)
 def grumemory(input,
+              size=None,
              name=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              bias_attr=None,
              param_attr=None,
              layer_attr=None):
@@ -1318,6 +1320,8 @@ def grumemory(input,
    :type name: None|basestring
    :param input: input layer.
    :type input: LayerOutput.
+    :param size: DEPRECATED. size of the gru cell
+    :type size: int
    :param reverse: Whether sequence process is reversed or not.
    :type reverse: bool
    :param act: activation type, TanhActivation by default. This activation
@@ -1334,9 +1338,6 @@ def grumemory(input,
    :type param_attr: ParameterAttribute|None|False
    :param layer_attr: Extra Layer attribute
    :type layer_attr: ExtraLayerAttribute|None
-    :param size: Stub parameter of size, but actually not used. If set this size
-                 will get a warning.
-    :type size: None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -1348,9 +1349,9 @@ def grumemory(input,
            plog = logger.warning
        else:
            plog = logger.fatal
-        plog("NOTE: the gru memory layer's size is set by previous input layer,"
+        plog("size of grumemory layer: %s is automatically set to "
-             " and should be input size / 3. Set size explicitly will be "
+             "size of input layer / 3. The parameter size passing to this "
-             "ignored.")
+             "layer is ignored." % (name))
    Layer(
        name=name,
@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,
 @wrap_bias_attr_default()
-@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
+@wrap_param_attr_default(
-                                                             initial_std=0.))
+    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
 @layer_support(DROPOUT)
@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
                    bias_attr=None,
                    layer_attr=None):
    """
-    LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
+    LSTM Step Layer. This function is used only in recurrent_group.
-    as follow.
+    The lstm equations are shown as follows.
    ..  math::
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
        h_t & = o_t tanh(c_t)
    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
    :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vector.
+    input vectors.
    The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
        ...
-    This layer contains two outputs. Default output is :math:`h_t`. The other
+    This layer has two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, which name is 'state' and can use
+    output is :math:`o_t`, whose name is 'state' and can use
    :code:`get_output_layer` to extract this output.
    :param name: Layer's name.
    :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal as
+    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
-                 :code:`input.size/4`, and should be equal as
+                 :code:`input.size/4`, and should be equal to
                 :code:`state.size`.
    :type size: int
    :param input: input layer. :math:`Wx_t + Wh_{t-1}`

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -614,6 +614,7 @@ def simple_lstm(input,
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   memory_boot=None,
                   name=None,
                   size=None,
                   param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
                   lstm_layer_attr=None,
                   get_output_layer_attr=None):
    """
-    Define calculations that a LSTM unit performs in a single time step.
+    Define calculations that a LSTM unit performs during a single time step.
-    This function itself is not a recurrent layer, so that it can not be
+    This function itself is not a recurrent layer, so it can not be
-    directly applied to sequence input. This function is always used in
+    directly used to process sequence inputs. This function is always used in
    recurrent_group (see layers.py for more details) to implement attention
    mechanism.
@@ -638,13 +639,13 @@ def lstmemory_unit(input,
    ..  math::
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
        h_t & = o_t tanh(c_t)
@@ -661,6 +662,8 @@ def lstmemory_unit(input,
    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param name: lstmemory unit name.
    :type name: basestring
    :param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
        assert input.size % 4 == 0
        size = input.size / 4
    out_mem = memory(name=name, size=size)
-    state_mem = memory(name="%s_state" % name, size=size)
+    state_mem = memory(
+        name="%s_state" % name, size=size, boot_layer=memory_boot)
    with mixed_layer(
            name="%s_input_recurrent" % name,
@@ -726,6 +730,7 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                    size=None,
                    name=None,
+                    memory_boot=None,
                    reverse=False,
                    param_attr=None,
                    act=None,
@@ -737,7 +742,7 @@ def lstmemory_group(input,
                    lstm_layer_attr=None,
                    get_output_layer_attr=None):
    """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
    does exactly the same calculation as the lstmemory layer (see lstmemory in
    layers.py for the maths) does. A promising benefit is that LSTM memory
    cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,
    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
    multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
    speed up the calculations. Consequently, an additional mixed_layer with
    full_matrix_projection must be included before lstmemory_unit is called.
@@ -765,10 +770,12 @@ def lstmemory_group(input,
    :param input: input layer name.
    :type input: LayerOutput
-    :param name: lstmemory group name.
-    :type name: basestring
    :param size: lstmemory group size.
    :type size: int
+    :param name: name of the lstmemory group.
+    :type name: basestring
+    :param memory_boot: the initialization state of LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param reverse: is lstm reversed
    :type reverse: bool
    :param param_attr: Parameter config, None if use default.
@@ -798,6 +805,7 @@ def lstmemory_group(input,
    def __lstm_step__(ipt):
        return lstmemory_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@ def lstmemory_group(input,
 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
             size=None,
             name=None,
             gru_bias_attr=None,
@@ -829,8 +838,8 @@ def gru_unit(input,
             naive=False):
    """
    Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
+    step. This function itself is not a recurrent layer, so it can not be
-    directly applied to sequence input. This function is almost always used in
+    directly used to process sequence inputs. This function is always used in
    the recurrent_group (see layers.py for more details) to implement attention
    mechanism.
@@ -838,6 +847,8 @@ def gru_unit(input,
    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initial hidden state of the GRU.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -856,7 +867,7 @@ def gru_unit(input,
    if size is None:
        size = input.size / 3
-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
    if naive:
        __step__ = gru_step_naive_layer
@@ -878,6 +889,7 @@ def gru_unit(input,
 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
              size=None,
              name=None,
              reverse=False,
@@ -888,7 +900,7 @@ def gru_group(input,
              gru_layer_attr=None,
              naive=False):
    """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
    does exactly the same calculation as the grumemory layer does. A promising
    benefit is that gru hidden states are accessible to the user. This is
    especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,
    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initial hidden state of the GRU.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -929,6 +943,7 @@ def gru_group(input,
    def __gru_step__(ipt):
        return gru_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,
    return grumemory(
        name=name,
-        size=size,
        input=m,
        reverse=reverse,
        bias_attr=gru_bias_attr,