Commit d011514e authored by Cao Ying, committed by GitHub

Merge pull request #2641 from lcy-seso/enable_boot_memory_for_lstm

enable users to set initial memory states for lstm/gru group.
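For context, a minimal sketch of what this change enables, written against the trainer_config_helpers API on this branch; the layer names and sizes are hypothetical. A single vector computed from an encoder boots the recurrent memory of the decoder groups instead of the default all-zero state.

from paddle.trainer_config_helpers import *

hidden = 256

# Hypothetical encoder: reduce the source sequence to one vector of size `hidden`.
src = data_layer(name='src', size=128)
enc_vec = fc_layer(input=last_seq(input=src), size=hidden, act=TanhActivation())

# Decoder inputs, projected to the sizes the recurrent groups expect.
trg = data_layer(name='trg', size=128)
lstm_in = fc_layer(input=trg, size=hidden * 4, act=LinearActivation())
gru_in = fc_layer(input=trg, size=hidden * 3, act=LinearActivation())

# New in this PR: memory_boot initializes the group's recurrent memory from enc_vec.
lstm_out = lstmemory_group(input=lstm_in, size=hidden, memory_boot=enc_vec)
gru_out = gru_group(input=gru_in, size=hidden, memory_boot=enc_vec)
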
......@@ -1149,10 +1149,10 @@ def pooling_layer(input,
@layer_support(DROPOUT)
def lstmemory(input,
name=None,
size=None,
reverse=False,
act=None,
gate_act=None,
size=None,
state_act=None,
bias_attr=None,
param_attr=None,
......@@ -1194,6 +1194,8 @@ def lstmemory(input,
:param name: The lstmemory layer name.
:type name: basestring
:param size: DEPRECATED. The size of the lstm cell.
:type size: int
:param input: input layer name.
:type input: LayerOutput
:param reverse: is sequence process reversed or not.
......@@ -1220,15 +1222,15 @@ def lstmemory(input,
assert state_act.support_hppl
assert act.support_hppl
assert input.size is not None and input.size % 4 == 0
if size is not None:
if input.size / 4 == size:
plog = logger.warning
else:
plog = logger.fatal
plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
"layer. The lstm size should be equal with input layer size/4. The"
" size which is set explicitly will be ignored." % name)
plog("size of lstmemory layer: %s is automatically set to "
"size of input layer / 4. The parameter size passing to "
"this layer is ignored." % (name))
Layer(
name=name,
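The reworded message reflects the rule itself: the cell width is always derived from the projected input. A small sketch of the three cases (hypothetical layer names, imports as in the first sketch above):

x = data_layer(name='x', size=64)
proj = fc_layer(input=x, size=512, act=LinearActivation())   # 4 * hidden size

lstm = lstmemory(input=proj)              # size inferred as 512 / 4 = 128
lstm = lstmemory(input=proj, size=128)    # DEPRECATED but consistent: warning only
# lstmemory(input=proj, size=100)         # inconsistent with input.size / 4: fatal
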
......@@ -1255,11 +1257,11 @@ def lstmemory(input,
@wrap_name_default("gru")
@layer_support(DROPOUT)
def grumemory(input,
size=None,
name=None,
reverse=False,
act=None,
gate_act=None,
size=None,
bias_attr=None,
param_attr=None,
layer_attr=None):
......@@ -1318,6 +1320,8 @@ def grumemory(input,
:type name: None|basestring
:param input: input layer.
:type input: LayerOutput.
:param size: DEPRECATED. The size of the gru cell.
:type size: int
:param reverse: Whether sequence process is reversed or not.
:type reverse: bool
:param act: activation type, TanhActivation by default. This activation
......@@ -1334,9 +1338,6 @@ def grumemory(input,
:type param_attr: ParameterAttribute|None|False
:param layer_attr: Extra Layer attribute
:type layer_attr: ExtraLayerAttribute|None
:param size: Stub parameter of size, but actually not used. If set this size
will get a warning.
:type size: None
:return: LayerOutput object.
:rtype: LayerOutput
"""
......@@ -1348,9 +1349,9 @@ def grumemory(input,
plog = logger.warning
else:
plog = logger.fatal
plog("NOTE: the gru memory layer's size is set by previous input layer,"
" and should be input size / 3. Set size explicitly will be "
"ignored.")
plog("size of grumemory layer: %s is automatically set to "
"size of input layer / 3. The parameter size passing to this "
"layer is ignored." % (name))
Layer(
name=name,
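The same rule holds for grumemory with a divisor of 3; for example (hypothetical layer names):

x = data_layer(name='x', size=64)
proj = fc_layer(input=x, size=384, act=LinearActivation())   # 3 * hidden size
gru = grumemory(input=proj)               # size inferred as 384 / 3 = 128
# An explicit size= that disagrees with input.size / 3 is fatal;
# a matching value is ignored with the warning above.
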
......@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,
@wrap_bias_attr_default()
@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
initial_std=0.))
@wrap_param_attr_default(
default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
@wrap_act_default(act=ReluActivation())
@wrap_name_default("batch_norm")
@layer_support(DROPOUT)
......@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
bias_attr=None,
layer_attr=None):
"""
LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
as follow.
LSTM Step Layer. This function is used only in recurrent_group.
The lstm equations are shown as follows.
.. math::
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
h_t & = o_t tanh(c_t)
The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
:code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
input vector.
input vectors.
The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
......@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
...
This layer contains two outputs. Default output is :math:`h_t`. The other
output is :math:`o_t`, which name is 'state' and can use
This layer has two outputs. The default output is :math:`h_t`. The other
output is :math:`o_t`, whose name is 'state'; users can use
:code:`get_output_layer` to extract this output.
:param name: Layer's name.
:type name: basestring
:param size: Layer's size. NOTE: lstm layer's size, should be equal as
:code:`input.size/4`, and should be equal as
:param size: Layer's size. NOTE: the lstm layer's size should be equal to
:code:`input.size/4`, and should be equal to
:code:`state.size`.
:type size: int
:param input: input layer. :math:`Wx_t + Wh_{t-1}`
......
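To make that requirement concrete, here is a sketch of a hand-written step function in the style this docstring describes, mirroring the pattern lstmemory_unit uses in networks.py; the layer names and sizes are hypothetical.

from paddle.trainer_config_helpers import *

hidden = 128
x_seq = data_layer(name='x_seq', size=64)   # hypothetical per-step input sequence

def lstm_step(x):
    # h_{t-1} and c_{t-1} of this step, linked to the layers below by name.
    h_prev = memory(name='dec_lstm', size=hidden)
    c_prev = memory(name='dec_lstm_state', size=hidden)
    # Build W x_t + W h_{t-1} with mixed_layer / full_matrix_projection.
    with mixed_layer(size=hidden * 4) as step_in:
        step_in += full_matrix_projection(input=x)
        step_in += full_matrix_projection(input=h_prev)
    out = lstm_step_layer(name='dec_lstm', input=step_in,
                          state=c_prev, size=hidden)
    # Expose the 'state' output so the c_prev memory can link to it by name.
    get_output_layer(name='dec_lstm_state', input=out, arg_name='state')
    return out

decoder = recurrent_group(name='decoder', step=lstm_step, input=x_seq)
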
......@@ -614,6 +614,7 @@ def simple_lstm(input,
@wrap_name_default('lstm_unit')
def lstmemory_unit(input,
memory_boot=None,
name=None,
size=None,
param_attr=None,
......@@ -626,9 +627,9 @@ def lstmemory_unit(input,
lstm_layer_attr=None,
get_output_layer_attr=None):
"""
Define calculations that a LSTM unit performs in a single time step.
This function itself is not a recurrent layer, so that it can not be
directly applied to sequence input. This function is always used in
Define calculations that an LSTM unit performs during a single time step.
This function itself is not a recurrent layer, so it cannot be
directly used to process sequence inputs. This function is always used in
recurrent_group (see layers.py for more details) to implement attention
mechanism.
......@@ -638,13 +639,13 @@ def lstmemory_unit(input,
.. math::
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
h_t & = o_t tanh(c_t)
......@@ -661,6 +662,8 @@ def lstmemory_unit(input,
:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the LSTM cell.
:type memory_boot: LayerOutput | None
:param name: lstmemory unit name.
:type name: basestring
:param size: lstmemory unit size.
......@@ -692,7 +695,8 @@ def lstmemory_unit(input,
assert input.size % 4 == 0
size = input.size / 4
out_mem = memory(name=name, size=size)
state_mem = memory(name="%s_state" % name, size=size)
state_mem = memory(
name="%s_state" % name, size=size, boot_layer=memory_boot)
with mixed_layer(
name="%s_input_recurrent" % name,
......@@ -726,6 +730,7 @@ def lstmemory_unit(input,
def lstmemory_group(input,
size=None,
name=None,
memory_boot=None,
reverse=False,
param_attr=None,
act=None,
......@@ -737,7 +742,7 @@ def lstmemory_group(input,
lstm_layer_attr=None,
get_output_layer_attr=None):
"""
lstm_group is a recurrent layer group version of Long Short Term Memory. It
lstm_group is a recurrent_group version of Long Short Term Memory. It
does exactly the same calculation as the lstmemory layer (see lstmemory in
layers.py for the maths) does. A promising benefit is that LSTM memory
cell states, or hidden states in every time step are accessible to the
......@@ -748,8 +753,8 @@ def lstmemory_group(input,
NOTE: In PaddlePaddle's implementation, the following input-to-hidden
multiplications:
:math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
:math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
:math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
:math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
speed up the calculations. Consequently, an additional mixed_layer with
full_matrix_projection must be included before lstmemory_unit is called.
......@@ -765,10 +770,12 @@ def lstmemory_group(input,
:param input: input layer name.
:type input: LayerOutput
:param name: lstmemory group name.
:type name: basestring
:param size: lstmemory group size.
:type size: int
:param name: name of the lstmemory group.
:type name: basestring
:param memory_boot: the initialization state of the LSTM cell.
:type memory_boot: LayerOutput | None
:param reverse: is lstm reversed
:type reverse: bool
:param param_attr: Parameter config, None if use default.
......@@ -798,6 +805,7 @@ def lstmemory_group(input,
def __lstm_step__(ipt):
return lstmemory_unit(
input=ipt,
memory_boot=memory_boot,
name=name,
size=size,
mixed_bias_attr=mixed_bias_attr,
......@@ -819,6 +827,7 @@ def lstmemory_group(input,
@wrap_name_default('gru_unit')
def gru_unit(input,
memory_boot=None,
size=None,
name=None,
gru_bias_attr=None,
......@@ -829,8 +838,8 @@ def gru_unit(input,
naive=False):
"""
Define calculations that a gated recurrent unit performs in a single time
step. This function itself is not a recurrent layer, so that it can not be
directly applied to sequence input. This function is almost always used in
step. This function itself is not a recurrent layer, so it cannot be
directly used to process sequence inputs. This function is always used in
the recurrent_group (see layers.py for more details) to implement attention
mechanism.
......@@ -838,6 +847,8 @@ def gru_unit(input,
:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the GRU cell.
:type memory_boot: LayerOutput | None
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
......@@ -856,7 +867,7 @@ def gru_unit(input,
if size is None:
size = input.size / 3
out_mem = memory(name=name, size=size)
out_mem = memory(name=name, size=size, boot_layer=memory_boot)
if naive:
__step__ = gru_step_naive_layer
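One difference from the LSTM case: since a GRU keeps no separate cell state, the booted memory here is the unit's output memory, i.e. the previous hidden state. A short parallel sketch (hypothetical names and sizes):

from paddle.trainer_config_helpers import *

hidden = 256
src = data_layer(name='src', size=128)
h_boot = fc_layer(input=last_seq(input=src), size=hidden, act=TanhActivation())

trg = data_layer(name='trg', size=128)
dec_in = fc_layer(input=trg, size=hidden * 3, act=LinearActivation())

def gru_step(x):
    # h_0 of this unit comes from h_boot instead of an all-zero vector.
    return gru_unit(input=x, name='dec_gru', size=hidden, memory_boot=h_boot)

decoder = recurrent_group(name='decoder', step=gru_step, input=dec_in)
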
......@@ -878,6 +889,7 @@ def gru_unit(input,
@wrap_name_default('gru_group')
def gru_group(input,
memory_boot=None,
size=None,
name=None,
reverse=False,
......@@ -888,7 +900,7 @@ def gru_group(input,
gru_layer_attr=None,
naive=False):
"""
gru_group is a recurrent layer group version of Gated Recurrent Unit. It
gru_group is a recurrent_group version of Gated Recurrent Unit. It
does exactly the same calculation as the grumemory layer does. A promising
benefit is that gru hidden states are accessible to the user. This is
especially useful in attention model. If you do not need to access
......@@ -908,6 +920,8 @@ def gru_group(input,
:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the GRU cell.
:type memory_boot: LayerOutput | None
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
......@@ -929,6 +943,7 @@ def gru_group(input,
def __gru_step__(ipt):
return gru_unit(
input=ipt,
memory_boot=memory_boot,
name=name,
size=size,
gru_bias_attr=gru_bias_attr,
......@@ -1083,7 +1098,6 @@ def simple_gru2(input,
return grumemory(
name=name,
size=size,
input=m,
reverse=reverse,
bias_attr=gru_bias_attr,
......