Commit 260c734c authored by luotao02

fix bug in trainer_config_helpers

ISSUE=4592807

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1423 1ad973e4-5ce8-4261-8a94-b56d1f490c56

Parent 9c0895ee
@@ -51,7 +51,7 @@ SequenceSoftmaxActivation
 =========================
 .. automodule:: paddle.trainer_config_helpers.activations
-    :members: SequenceSoftmax
+    :members: SequenceSoftmaxActivation
     :noindex:

 ReluActivation
......
@@ -136,6 +136,18 @@ gru_step_layer
 Recurrent Layer Group
 =====================
+recurrent_group
+---------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: recurrent_group
+    :noindex:
+
+beam_search
+------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: beam_search
+    :noindex:
+
 get_output_layer
 -----------------
 .. automodule:: paddle.trainer_config_helpers.layers
......
@@ -43,34 +43,52 @@ vgg_16_network
 Recurrent
 =========
+LSTM
+----
+
 lstmemory_unit
---------------
+``````````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: lstmemory_unit
     :noindex:

 lstmemory_group
----------------
+```````````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: lstmemory_group
     :noindex:

+simple_lstm
+```````````
+.. automodule:: paddle.trainer_config_helpers.networks
+    :members: simple_lstm
+    :noindex:
+
+bidirectional_lstm
+``````````````````
+.. automodule:: paddle.trainer_config_helpers.networks
+    :members: bidirectional_lstm
+    :noindex:
+
+GRU
+---
+
 gru_unit
----------
+````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: gru_unit
     :noindex:

-simple_lstm
------------
+gru_group
+`````````
 .. automodule:: paddle.trainer_config_helpers.networks
-    :members: simple_lstm
+    :members: gru_group
     :noindex:

-bidirectional_lstm
-------------------
+simple_gru
+``````````
 .. automodule:: paddle.trainer_config_helpers.networks
-    :members: bidirectional_lstm
+    :members: simple_gru
     :noindex:

 simple_attention
......
@@ -10,10 +10,10 @@ AdamOptimizer
     :members: AdamOptimizer
     :noindex:

-AdamxOptimizer
+AdamaxOptimizer
 ================
 .. automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdamxOptimizer
+    :members: AdamaxOptimizer
     :noindex:

 AdaGradOptimizer
......
@@ -28,9 +28,8 @@ ld-linux-x86-64.so.2
 x86_64-scm-linux-gnu/
 .lint.*.md5
-examples/crf/*.bin
 .idea/
+.test_env
 Paddle_wrap.cxx
 Paddle_wrap.h
 paddle.py
......
@@ -97,13 +97,13 @@ protected:
   * @param starts Each start position of each samples.
   * @param inputValue The input values.
   */
-  void forwardSequence(int batchSize, size_t numSequences,
-                       const int *starts, MatrixPtr inputValue);
+  void forwardSequence(int batchSize, size_t numSequences, const int *starts,
+                       MatrixPtr inputValue);
   /**
   * Compute lstm backward one sequence by one sequence.
   */
-  void backwardSequence(int batchSize, size_t numSequences,
-                        const int *starts, MatrixPtr inputGrad);
+  void backwardSequence(int batchSize, size_t numSequences, const int *starts,
+                        MatrixPtr inputGrad);
   /**
   * Compute lstm forward one batch by one batch. The batch value is
@@ -121,21 +121,21 @@
   * }
   * @endcode
   */
-  void forwardBatch(int batchSize, size_t numSequences,
-                    const int *starts, MatrixPtr inputValue);
+  void forwardBatch(int batchSize, size_t numSequences, const int *starts,
+                    MatrixPtr inputValue);
   /**
   * Compute lstm backward one batch by one batch.
   */
-  void backwardBatch(int batchSize, size_t numSequences,
-                     const int *starts, MatrixPtr inputGrad);
+  void backwardBatch(int batchSize, size_t numSequences, const int *starts,
+                     MatrixPtr inputGrad);
   /**
   * This function only supports GPU. It not need to reorganize input into
   * batch value. It will launch one kernel to parallelly compute forward
   * propagation in sequence level.
   */
-  void forwardSeqParallel(int batchSize, size_t numSequences,
-                          const int *starts, MatrixPtr inputValue);
+  void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts,
+                          MatrixPtr inputValue);
   /**
   * Backward propagation corresponding to forwardSeqParallel.
   */
@@ -157,7 +157,8 @@ protected:
   /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
   std::unique_ptr<Weight> weight_;
   /// Learned bias parameter, shape: (1, 7 * size).
-  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
+  /// W_{co}\f$.
   std::unique_ptr<Weight> bias_;
   /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
   MatrixPtr localBias_;
......
@@ -669,7 +669,7 @@ def fc_layer(input, size, act=None, name=None,
                        act=LinearActivation(),
                        bias_attr=False)

     which is equal to:

     .. code-block:: python
@@ -795,15 +795,15 @@ def lstmemory(input, name=None, reverse=False, act=None,
     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

     NOTE: In paddle's implementation, the multiply operation
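As a reading aid for this hunk (and the identical equation block in lstm_step_layer further down), here is a minimal NumPy sketch of one step of the peephole LSTM that the docstring describes. It is an illustration only, not PaddlePaddle's implementation, and every name in it is hypothetical:

```python
# Minimal NumPy sketch of the docstring's gate equations (illustration only,
# not PaddlePaddle code; all names here are hypothetical).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W_x, W_h, w_c, b):
    """One step; W_x/W_h/b hold the i, f, c, o blocks, w_c the peephole vectors."""
    pre = {g: x_t @ W_x[g] + h_prev @ W_h[g] + b[g] for g in "ifco"}
    i_t = sigmoid(pre["i"] + w_c["i"] * c_prev)      # input gate (peephole on c_{t-1})
    f_t = sigmoid(pre["f"] + w_c["f"] * c_prev)      # forget gate (peephole on c_{t-1})
    c_t = f_t * c_prev + i_t * np.tanh(pre["c"])     # cell state
    o_t = sigmoid(pre["o"] + w_c["o"] * c_t)         # output gate (peephole on c_t)
    h_t = o_t * np.tanh(c_t)                         # hidden output
    return h_t, c_t
```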
@@ -1294,8 +1294,6 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
                      label=data_layer,
                      num_classes=3)

-    :param name: layer name
-    :type name: basestring
     :param input: Input layers. It could be a LayerOutput or list/tuple of
                   LayerOutput.
     :type input: LayerOutput|list|tuple
@@ -1303,6 +1301,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
     :type label: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
+    :param name: layer name
+    :type name: basestring
     :param bias_attr: Bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
@@ -1943,18 +1943,18 @@ def lstm_step_layer(input, state, size, act=None,
     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

-    The input\_ of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
+    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
     :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
     input vector.
@@ -2347,12 +2347,12 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
        eos = eos_layer(input=layer, eos_id=id)

-    :param name: Layer name.
-    :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
     :param eos_id: end id of sequence
     :type eos_id: int
+    :param name: Layer name.
+    :type name: basestring
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: layer name.
@@ -2529,11 +2529,11 @@ def conv_operator(input, filter_size, num_filters,
     :param num_filter: channel of output data.
     :type num_filter: int
     :param num_channel: channel of input data.
-    :rtype num_channel: int
+    :type num_channel: int
     :param stride: The x dimension of the stride.
-    :rtype stride: int
+    :type stride: int
     :param stride_y: The y dimension of the stride.
-    :rtype stride_y: int
+    :type stride_y: int
     :param padding: The x dimension of padding.
     :type padding: int
     :param padding_y: The y dimension of padding.
@@ -2632,7 +2632,7 @@ def tensor_layer(input, size, act=None, name=None,
     :param input: Input layer.
     :type input: LayerOutput|list|tuple.
     :param size: the layer dimension.
-    :rtype: int.
+    :type size: int.
     :param act: Activation Type. Default is tanh.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
@@ -2840,7 +2840,7 @@ def convex_comb_layer(input, size, name=None):
     """
     A layer for convex weighted average of vectors takes two inputs.
       - Input: a vector containing the convex weights (batchSize x weightdim),
-        and a matrix in a vector form (batchSize x (weightdim*datadim)).
+        and a matrix in a vector form (batchSize x (weightdim * datadim)).
       - Output: a vector (batchSize * datadim).

     .. math::
@@ -2893,8 +2893,8 @@ def block_expand_layer(input,
                        name=None):
     """
     Expand feature map to minibatch matrix.
       - matrix width is: block_y * block_x * channel
       - matirx height is: outputH * outputW

     .. math::
@@ -3100,11 +3100,11 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
     .. math::

-       C_{i,j} = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
-       o_{i,j} = o_i - o_j
-       \\tilde{P_{i,j}} = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
+       C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
+       o_{i,j} & = o_i - o_j
+       \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}

     In this formula:
       - :math:`C_{i,j}` is the cross entropy cost.
......
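The rank_cost equations retypeset in the hunk above are simple enough to sanity-check numerically; the snippet below is only such a check (hypothetical names, not Paddle code):

```python
# Numerical check of the rank_cost formula above (hypothetical names, not
# Paddle code): C = -label * (o_i - o_j) + log(1 + exp(o_i - o_j)).
import numpy as np

def rank_cost_value(o_left, o_right, label):
    o_ij = o_left - o_right          # o_{i,j} = o_i - o_j
    return -label * o_ij + np.log1p(np.exp(o_ij))

print(rank_cost_value(2.0, 1.0, 1.0))   # left ranked higher as labelled: small cost
print(rank_cost_value(1.0, 2.0, 1.0))   # wrong order: larger cost
```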
@@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     """
     Simple LSTM Cell.

-    It just combine a mix_layer with fully_matrix_projection and a lstmemory
+    It just combine a mixed layer with fully_matrix_projection and a lstmemory
     layer. The simple lstm cell was implemented as follow equations.

     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

     Please refer **Generating Sequences With Recurrent Neural Networks** if you
     want to know what lstm is. Link_ is here.
@@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 @wrap_name_default('lstm_unit')
-def lstmemory_unit(input, name=None, size=None,
-                   mixed_bias_attr=None, mixed_layer_attr=None,
-                   param_attr=None, lstm_bias_attr=None,
-                   act=None, gate_act=None,
-                   state_act=None, lstm_layer_attr=None,
+def lstmemory_unit(input, name=None, size=None, param_attr=None,
+                   act=None, gate_act=None, state_act=None,
+                   mixed_bias_attr=None, lstm_bias_attr=None,
+                   mixed_layer_attr=None,lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
     TODO(yuyang18): complete docs

-    @param input:
-    @param name:
-    @param size:
-    @param mixed_bias_attr:
-    @param mixed_layer_attr:
-    @param param_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: lstmemory unit name.
+    :type name: basestring
+    :param size: lstmemory unit size.
+    :type size: int
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activate type
+    :type act: BaseActivation
+    :param gate_act: lstm gate activate type
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activate type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory unit name.
+    :rtype: LayerOutput
     """
     if size is None:
         assert input.size % 4 == 0
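Since the docstring above is still marked TODO, a call sketch may help; it is assembled purely from the parameter list documented in this hunk (mirroring the __lstm_step__ helper in the next hunk). The data layer, sizes and activations below are hypothetical, not taken from the Paddle source or its examples:

```python
# Hypothetical call sketch based only on the parameters documented above.
from paddle.trainer_config_helpers import *

ipt = data_layer(name='lstm_input', size=1024)      # assumed to be 4 * lstm size
out = lstmemory_unit(input=ipt,
                     name='lstm_unit',
                     size=256,
                     act=TanhActivation(),
                     gate_act=SigmoidActivation(),
                     state_act=TanhActivation(),
                     mixed_bias_attr=False,         # no bias in the mixed layer
                     lstm_bias_attr=None)           # default bias in the lstm layer
```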
@@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None,
 @wrap_name_default('lstm_group')
 def lstmemory_group(input, size=None, name=None,
                     reverse=False, param_attr=None,
-                    mix_bias_attr=None, lstm_bias_attr=None,
                     act=None, gate_act=None, state_act=None,
+                    mixed_bias_attr=None, lstm_bias_attr=None,
                     mixed_layer_attr=None, lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
     TODO(yuyang18): complete docs

-    @param input:
-    @param size:
-    @param name:
-    @param reverse:
-    @param param_attr:
-    @param mix_bias_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param mixed_layer_attr:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: lstmemory group name.
+    :type name: basestring
+    :param size: lstmemory group size.
+    :type size: int
+    :param reverse: is lstm reversed
+    :type reverse: bool
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activate type
+    :type act: BaseActivation
+    :param gate_act: lstm gate activate type
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activate type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory group name.
+    :rtype: LayerOutput
     """
     def __lstm_step__(ipt):
         return lstmemory_unit(input=ipt, name=name,
-                              size=size, mixed_bias_attr=mix_bias_attr,
+                              size=size, mixed_bias_attr=mixed_bias_attr,
                               mixed_layer_attr=mixed_layer_attr,
                              param_attr=param_attr,
                              lstm_bias_attr=lstm_bias_attr,
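This hunk carries the rename from the commit message: the keyword mix_bias_attr becomes mixed_bias_attr, so lstmemory_group and lstmemory_unit expose a consistent argument name. A hedged usage sketch follows; the layer names and sizes are hypothetical and not taken from Paddle's examples:

```python
# Hypothetical usage sketch built from the documented parameters.
from paddle.trainer_config_helpers import *

seq = data_layer(name='seq_input', size=1024)       # assumed to be 4 * lstm size
lstm_seq = lstmemory_group(input=seq,
                           size=256,
                           reverse=False,
                           gate_act=SigmoidActivation(),
                           state_act=TanhActivation(),
                           mixed_bias_attr=False)   # the renamed keyword
```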
@@ -760,13 +790,14 @@ def simple_attention(encoded_sequence,
     Size of the context vector equals to size of encoded_sequence.

     .. math::
-        a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
-    .. math::
-        e_{i,j} = a(s_{i-1}, h_{j})
-    .. math::
-        a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
-    .. math::
-        c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
+
+        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}

     where :math:`h_{j}` is the jth element of encoded_sequence,
     :math:`U_{a}h_{j}` is the jth element of encoded_proj
@@ -778,6 +809,7 @@ def simple_attention(encoded_sequence,
     https://arxiv.org/abs/1409.0473.

     The example usage is:
+
     .. code-block:: python

         context = simple_attention(encoded_sequence=enc_seq,
......
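The four attention equations retypeset above reduce to a few lines of linear algebra; the following NumPy sketch is a reading aid only (hypothetical names, not the simple_attention implementation):

```python
# NumPy sketch of the attention equations in the docstring above (reading aid
# only; hypothetical names, not the simple_attention implementation).
import numpy as np

def attention_context(s_prev, H, W_a, U_a, v_a):
    """s_prev: (d,) decoder state; H: (T, h) encoder states h_j."""
    scores = np.tanh(s_prev @ W_a + H @ U_a) @ v_a     # e_{i,j} = v_a f(W_a s_{i-1} + U_a h_j)
    weights = np.exp(scores) / np.sum(np.exp(scores))  # a_{i,j}: softmax over j
    return weights @ H                                 # c_i = sum_j a_{i,j} h_j
```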
@@ -61,7 +61,7 @@ class BaseSGDOptimizer(Optimizer):
     .. math::

-        w:= w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)

     where :math:`\\eta` is learning rate. And :math:`n` is batch size.
@@ -99,9 +99,9 @@ class AdamOptimizer(BaseSGDOptimizer):
     .. math::

-        m(w, t) &:= \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) &:= \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w &:= w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
+        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}

     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
@@ -136,11 +136,12 @@ class AdamaxOptimizer(BaseSGDOptimizer):
     The details of please refer this `Adam: A Method for Stochastic Optimization
     <https://arxiv.org/abs/1412.6980>`_

     .. math::

-        m_t &:= \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t &:= max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t &:= w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t

     :param beta1: the :math:`\\beta_1` in the equation.
     :type beta1: float
@@ -175,7 +176,7 @@ class AdaGradOptimizer(BaseSGDOptimizer):
     .. math::

         G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w &:= w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
     """

     def to_setting_kwargs(self):
@@ -197,8 +198,8 @@ class RMSPropOptimizer(BaseSGDOptimizer):
     .. math::

-        v(w, t) &:= \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w &:= w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)

     :param rho: the :math:`\\rho` in the equation. The forgetting factor.
     :type rho: float
......
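The update rules retypeset above map directly onto code; the following NumPy sketch restates the RMSProp and Adamax rules exactly as written (illustration only; hypothetical names, not trainer_config_helpers code):

```python
# NumPy restatement of the RMSProp and Adamax update rules shown above
# (illustration only; hypothetical names, not trainer_config_helpers code).
import numpy as np

def rmsprop_update(w, grad, v, eta=1e-3, rho=0.95, eps=1e-6):
    v = rho * v + (1.0 - rho) * grad ** 2        # v(w, t)
    w = w - eta / np.sqrt(v + eps) * grad        # w update
    return w, v

def adamax_update(w, grad, m, u, t, eta=1e-3, beta1=0.9, beta2=0.999):
    m = beta1 * m + (1.0 - beta1) * grad         # m_t
    u = np.maximum(beta2 * u, np.abs(grad))      # u_t
    w = w - (eta / (1.0 - beta1 ** t)) * m / u   # w_t
    return w, m, u
```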