enable initializing memory state for lstmemory_group.

b0ad9c90 · caoying03 · 633082ad · b0ad9c90 · b0ad9c90
Showing with 57 additions and 42 deletions

python/paddle/trainer_config_helpers/layers.py python/paddle/trainer_config_helpers/layers.py +26 -25

python/paddle/trainer_config_helpers/networks.py python/paddle/trainer_config_helpers/networks.py +31 -17

未找到文件。
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1149,10 +1149,10 @@ def pooling_layer(input,
 @layer_support(DROPOUT)
 def lstmemory(input,
              name=None,
+              size=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              state_act=None,
              bias_attr=None,
              param_attr=None,
@@ -1194,6 +1194,8 @@ def lstmemory(input,

    :param name: The lstmemory layer name.
    :type name: basestring
+    :param size: DEPRECATED. size of the lstm cell
+    :type size: int
    :param input: input layer name.
    :type input: LayerOutput
    :param reverse: is sequence process reversed or not.
@@ -1220,15 +1222,15 @@ def lstmemory(input,
    assert state_act.support_hppl
    assert act.support_hppl
    assert input.size is not None and input.size % 4 == 0
+
    if size is not None:
        if input.size / 4 == size:
            plog = logger.warning
        else:
            plog = logger.fatal
-
-        plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
-             "layer. The lstm size should be equal with input layer size/4. The"
-             " size which is set explicitly will be ignored." % name)
+        plog("size of lstmemory layer: %s is automatically set to "
+             "size of input layer / 4. The parameter size passing to "
+             "this layer is ignored." % (name))

    Layer(
        name=name,
@@ -1255,11 +1257,11 @@ def lstmemory(input,
 @wrap_name_default("gru")
 @layer_support(DROPOUT)
 def grumemory(input,
+              size=None,
              name=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              bias_attr=None,
              param_attr=None,
              layer_attr=None):
@@ -1318,6 +1320,8 @@ def grumemory(input,
    :type name: None|basestring
    :param input: input layer.
    :type input: LayerOutput.
+    :param size: DEPRECATED. size of the gru cell
+    :type size: int
    :param reverse: Whether sequence process is reversed or not.
    :type reverse: bool
    :param act: activation type, TanhActivation by default. This activation
@@ -1334,9 +1338,6 @@ def grumemory(input,
    :type param_attr: ParameterAttribute|None|False
    :param layer_attr: Extra Layer attribute
    :type layer_attr: ExtraLayerAttribute|None
-    :param size: Stub parameter of size, but actually not used. If set this size
-                 will get a warning.
-    :type size: None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -1348,9 +1349,9 @@ def grumemory(input,
            plog = logger.warning
        else:
            plog = logger.fatal
-        plog("NOTE: the gru memory layer's size is set by previous input layer,"
-             " and should be input size / 3. Set size explicitly will be "
-             "ignored.")
+        plog("size of grumemory layer: %s is automatically set to "
+             "size of input layer / 3. The parameter size passing to this "
+             "layer is ignored." % (name))

    Layer(
        name=name,
@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,


 @wrap_bias_attr_default()
-@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
-                                                             initial_std=0.))
+@wrap_param_attr_default(
+    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
 @layer_support(DROPOUT)
@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
                    bias_attr=None,
                    layer_attr=None):
    """
-    LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
-    as follow.
+    LSTM Step Layer. This function is used only in recurrent_group.
+    The lstm equations are shown as follows.

    ..  math::

-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

        h_t & = o_t tanh(c_t)


    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
    :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vector.
+    input vectors.

    The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do

@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
        ...


-    This layer contains two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, which name is 'state' and can use
+    This layer has two outputs. Default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and can use
    :code:`get_output_layer` to extract this output.

    :param name: Layer's name.
    :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal as
-                 :code:`input.size/4`, and should be equal as
+    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
+                 :code:`input.size/4`, and should be equal to
                 :code:`state.size`.
    :type size: int
    :param input: input layer. :math:`Wx_t + Wh_{t-1}`

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -614,6 +614,7 @@ def simple_lstm(input,

 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   memory_boot=None,
                   name=None,
                   size=None,
                   param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
                   lstm_layer_attr=None,
                   get_output_layer_attr=None):
    """
-    Define calculations that a LSTM unit performs in a single time step.
-    This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is always used in
+    Define calculations that a LSTM unit performs during a single time step.
+    This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
    recurrent_group (see layers.py for more details) to implement attention
    mechanism.

@@ -638,13 +639,13 @@ def lstmemory_unit(input,

    ..  math::

-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

        h_t & = o_t tanh(c_t)

@@ -661,6 +662,8 @@ def lstmemory_unit(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param name: lstmemory unit name.
    :type name: basestring
    :param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
        assert input.size % 4 == 0
        size = input.size / 4
    out_mem = memory(name=name, size=size)
-    state_mem = memory(name="%s_state" % name, size=size)
+    state_mem = memory(
+        name="%s_state" % name, size=size, boot_layer=memory_boot)

    with mixed_layer(
            name="%s_input_recurrent" % name,
@@ -726,6 +730,7 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                    size=None,
                    name=None,
+                    memory_boot=None,
                    reverse=False,
                    param_attr=None,
                    act=None,
@@ -737,7 +742,7 @@ def lstmemory_group(input,
                    lstm_layer_attr=None,
                    get_output_layer_attr=None):
    """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
    does exactly the same calculation as the lstmemory layer (see lstmemory in
    layers.py for the maths) does. A promising benefit is that LSTM memory
    cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,

    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
    multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
    speed up the calculations. Consequently, an additional mixed_layer with
    full_matrix_projection must be included before lstmemory_unit is called.

@@ -765,8 +770,10 @@ def lstmemory_group(input,

    :param input: input layer name.
    :type input: LayerOutput
-    :param name: lstmemory group name.
+    :param name: name of the lstmemory group.
    :type name: basestring
+    :param memory_boot: the initialization state of LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param size: lstmemory group size.
    :type size: int
    :param reverse: is lstm reversed
@@ -798,6 +805,7 @@ def lstmemory_group(input,
    def __lstm_step__(ipt):
        return lstmemory_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@ def lstmemory_group(input,

 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
             size=None,
             name=None,
             gru_bias_attr=None,
@@ -829,8 +838,8 @@ def gru_unit(input,
             naive=False):
    """
    Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is almost always used in
+    step. This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
    the recurrent_group (see layers.py for more details) to implement attention
    mechanism.

@@ -838,6 +847,8 @@ def gru_unit(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU hidden output.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -856,7 +867,7 @@ def gru_unit(input,
    if size is None:
        size = input.size / 3

-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)

    if naive:
        __step__ = gru_step_naive_layer
@@ -878,6 +889,7 @@ def gru_unit(input,

 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
              size=None,
              name=None,
              reverse=False,
@@ -888,7 +900,7 @@ def gru_group(input,
              gru_layer_attr=None,
              naive=False):
    """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
    does exactly the same calculation as the grumemory layer does. A promising
    benefit is that gru hidden states are accessible to the user. This is
    especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU hidden output.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -929,6 +943,7 @@ def gru_group(input,
    def __gru_step__(ipt):
        return gru_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,

    return grumemory(
        name=name,
-        size=size,
        input=m,
        reverse=reverse,
        bias_attr=gru_bias_attr,