Commit 260c734c authored by luotao02

fix bug in trainer_config_helpers

ISSUE=4592807 

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1423 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent 9c0895ee
......@@ -51,7 +51,7 @@ SequenceSoftmaxActivation
=========================
.. automodule:: paddle.trainer_config_helpers.activations
:members: SequenceSoftmax
:members: SequenceSoftmaxActivation
:noindex:
ReluActivation
......
......@@ -136,6 +136,18 @@ gru_step_layer
Recurrent Layer Group
=====================
recurrent_group
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: recurrent_group
:noindex:
beam_search
------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: beam_search
:noindex:
get_output_layer
-----------------
.. automodule:: paddle.trainer_config_helpers.layers
......
......@@ -43,34 +43,52 @@ vgg_16_network
Recurrent
=========
LSTM
----
lstmemory_unit
--------------
``````````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: lstmemory_unit
:noindex:
lstmemory_group
---------------
```````````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: lstmemory_group
:noindex:
simple_lstm
```````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: simple_lstm
:noindex:
bidirectional_lstm
``````````````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: bidirectional_lstm
:noindex:
GRU
---
gru_unit
---------
````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: gru_unit
:noindex:
simple_lstm
-----------
gru_group
`````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: simple_lstm
:members: gru_group
:noindex:
bidirectional_lstm
------------------
simple_gru
``````````
.. automodule:: paddle.trainer_config_helpers.networks
:members: bidirectional_lstm
:members: simple_gru
:noindex:
simple_attention
......
......@@ -10,10 +10,10 @@ AdamOptimizer
:members: AdamOptimizer
:noindex:
AdamxOptimizer
AdamaxOptimizer
================
.. automodule:: paddle.trainer_config_helpers.optimizers
:members: AdamxOptimizer
:members: AdamaxOptimizer
:noindex:
AdaGradOptimizer
......
......@@ -28,9 +28,8 @@ ld-linux-x86-64.so.2
x86_64-scm-linux-gnu/
.lint.*.md5
examples/crf/*.bin
.idea/
.test_env
Paddle_wrap.cxx
Paddle_wrap.h
paddle.py
......
......@@ -97,13 +97,13 @@ protected:
* @param starts Each start position of each samples.
* @param inputValue The input values.
*/
void forwardSequence(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
void forwardSequence(int batchSize, size_t numSequences, const int *starts,
MatrixPtr inputValue);
/**
* Compute lstm backward one sequence by one sequence.
*/
void backwardSequence(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputGrad);
void backwardSequence(int batchSize, size_t numSequences, const int *starts,
MatrixPtr inputGrad);
/**
* Compute lstm forward one batch by one batch. The batch value is
......@@ -121,21 +121,21 @@ protected:
* }
* @endcode
*/
void forwardBatch(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
void forwardBatch(int batchSize, size_t numSequences, const int *starts,
MatrixPtr inputValue);
/**
* Compute lstm backward one batch by one batch.
*/
void backwardBatch(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputGrad);
void backwardBatch(int batchSize, size_t numSequences, const int *starts,
MatrixPtr inputGrad);
/**
* This function only supports GPU. It does not need to reorganize the input
* into batch values. It launches one kernel to compute forward propagation
* in parallel at the sequence level.
*/
void forwardSeqParallel(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts,
MatrixPtr inputValue);
/**
* Backward propagation corresponding to forwardSeqParallel.
*/
......@@ -157,7 +157,8 @@ protected:
/// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
std::unique_ptr<Weight> weight_;
/// Learned bias parameter, shape: (1, 7 * size).
/// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
/// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
/// W_{co}\f$.
std::unique_ptr<Weight> bias_;
/// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
MatrixPtr localBias_;
......
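As a side note on the parameter layout described in the comments above, the following NumPy sketch (illustration only; the block order inside the concatenated matrices is an assumption) shows how the [size, 4*size] weight and the (1, 7*size) bias would be sliced into their named parts:

.. code-block:: python

    import numpy as np

    size = 32
    # Recurrent weight, shape [size, 4 * size]; assumed block order
    # W_hi | W_hf | W_hc | W_ho, as listed in the comment above.
    weight = np.zeros((size, 4 * size))
    W_hi, W_hf, W_hc, W_ho = np.split(weight, 4, axis=1)

    # Learned bias, shape (1, 7 * size); assumed block order
    # b_i | b_f | b_c | b_o followed by the peepholes W_ci | W_cf | W_co.
    bias = np.zeros((1, 7 * size))
    b_i, b_f, b_c, b_o, W_ci, W_cf, W_co = np.split(bias, 7, axis=1)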
......@@ -795,15 +795,15 @@ def lstmemory(input, name=None, reverse=False, act=None,
.. math::
i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
h_t = o_t tanh(c_t)
h_t & = o_t tanh(c_t)
NOTE: In paddle's implementation, the multiply operation
......@@ -1294,8 +1294,6 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
label=data_layer,
num_classes=3)
:param name: layer name
:type name: basestring
:param input: Input layers. It could be a LayerOutput or list/tuple of
LayerOutput.
:type input: LayerOutput|list|tuple
......@@ -1303,6 +1301,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
:type label: LayerOutput
:param num_classes: number of classes.
:type num_classes: int
:param name: layer name
:type name: basestring
:param bias_attr: Bias attribute. None means default bias.
False means no bias.
:type bias_attr: ParameterAttribute|False
......@@ -1943,18 +1943,18 @@ def lstm_step_layer(input, state, size, act=None,
.. math::
i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
h_t = o_t tanh(c_t)
h_t & = o_t tanh(c_t)
The input\_ of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
:code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
input vector.
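To make the recurrence concrete, here is a minimal NumPy sketch of one step following the equations above. It assumes the projected input :math:`Wx_t + Wh_{t-1}` arrives as a single vector of width 4 * size whose block order (input, forget, cell, output) is an illustrative assumption:

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(z, c_prev, b, w_ci, w_cf, w_co):
        """One LSTM step. z is the precomputed W*x_t + W*h_{t-1} (width
        4 * size), b concatenates b_i, b_f, b_c, b_o, and w_ci/w_cf/w_co
        are the element-wise peephole weights (width size each)."""
        z_i, z_f, z_c, z_o = np.split(z + b, 4)
        i = sigmoid(z_i + w_ci * c_prev)       # input gate
        f = sigmoid(z_f + w_cf * c_prev)       # forget gate
        c = f * c_prev + i * np.tanh(z_c)      # new cell state
        o = sigmoid(z_o + w_co * c)            # output gate
        h = o * np.tanh(c)                     # hidden output
        return h, c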
......@@ -2347,12 +2347,12 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
eos = eos_layer(input=layer, eos_id=id)
:param name: Layer name.
:type name: basestring
:param input: Input layer name.
:type input: LayerOutput
:param eos_id: end id of sequence
:type eos_id: int
:param name: Layer name.
:type name: basestring
:param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute.
:return: layer name.
......@@ -2529,11 +2529,11 @@ def conv_operator(input, filter_size, num_filters,
:param num_filter: channel of output data.
:type num_filter: int
:param num_channel: channel of input data.
:rtype num_channel: int
:type num_channel: int
:param stride: The x dimension of the stride.
:rtype stride: int
:type stride: int
:param stride_y: The y dimension of the stride.
:rtype stride_y: int
:type stride_y: int
:param padding: The x dimension of padding.
:type padding: int
:param padding_y: The y dimension of padding.
......@@ -2632,7 +2632,7 @@ def tensor_layer(input, size, act=None, name=None,
:param input: Input layer.
:type input: LayerOutput|list|tuple.
:param size: the layer dimension.
:rtype: int.
:type size: int.
:param act: Activation Type. Default is tanh.
:type act: BaseActivation
:param param_attr: The Parameter Attribute.
......@@ -2840,7 +2840,7 @@ def convex_comb_layer(input, size, name=None):
"""
A layer for convex weighted average of vectors takes two inputs.
- Input: a vector containing the convex weights (batchSize x weightdim),
and a matrix in a vector form (batchSize x (weightdim*datadim)).
and a matrix in a vector form (batchSize x (weightdim * datadim)).
- Output: a vector (batchSize * datadim).
.. math::
......@@ -3100,11 +3100,11 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
.. math::
C_{i,j} = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
o_{i,j} = o_i - o_j
o_{i,j} & = o_i - o_j
\\tilde{P_{i,j}} = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
\\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
In this formula:
- :math:`C_{i,j}` is the cross entropy cost.
......
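For reference, a small runnable sketch of the pairwise rank_cost formula above; the sample scores are made up for illustration:

.. code-block:: python

    import numpy as np

    def rank_cost(o_i, o_j, p_tilde):
        """C_{i,j} = -p~ * o_{i,j} + log(1 + exp(o_{i,j})), with o_{i,j} = o_i - o_j.
        p_tilde is the target probability that item i ranks above item j
        (0, 0.5, or 1)."""
        o_ij = o_i - o_j
        return -p_tilde * o_ij + np.log1p(np.exp(o_ij))

    # The model scores i above j and the label agrees, so the cost is small.
    print(rank_cost(2.0, 0.5, 1.0))   # ~0.20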
......@@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
"""
Simple LSTM Cell.
It just combine a mix_layer with fully_matrix_projection and a lstmemory
It just combine a mixed layer with fully_matrix_projection and a lstmemory
layer. The simple lstm cell was implemented as follow equations.
.. math::
i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
h_t = o_t tanh(c_t)
h_t & = o_t tanh(c_t)
Please refer **Generating Sequences With Recurrent Neural Networks** if you
want to know what lstm is. Link_ is here.
......@@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
@wrap_name_default('lstm_unit')
def lstmemory_unit(input, name=None, size=None,
mixed_bias_attr=None, mixed_layer_attr=None,
param_attr=None, lstm_bias_attr=None,
act=None, gate_act=None,
state_act=None, lstm_layer_attr=None,
def lstmemory_unit(input, name=None, size=None, param_attr=None,
act=None, gate_act=None, state_act=None,
mixed_bias_attr=None, lstm_bias_attr=None,
mixed_layer_attr=None,lstm_layer_attr=None,
get_output_layer_attr=None):
"""
TODO(yuyang18): complete docs
@param input:
@param name:
@param size:
@param mixed_bias_attr:
@param mixed_layer_attr:
@param param_attr:
@param lstm_bias_attr:
@param act:
@param gate_act:
@param state_act:
@param lstm_layer_attr:
@param get_output_layer_attr:
@return:
:param input: input layer name.
:type input: LayerOutput
:param name: lstmemory unit name.
:type name: basestring
:param size: lstmemory unit size.
:type size: int
:param param_attr: Parameter config, None if use default.
:type param_attr: ParameterAttribute
:param act: lstm final activate type
:type act: BaseActivation
:param gate_act: lstm gate activate type
:type gate_act: BaseActivation
:param state_act: lstm state activate type.
:type state_act: BaseActivation
:param mixed_bias_attr: bias parameter attribute of mixed layer.
False means no bias, None means default bias.
:type mixed_bias_attr: ParameterAttribute|False
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False
:param mixed_layer_attr: mixed layer's extra attribute.
:type mixed_layer_attr: ExtraLayerAttribute
:param lstm_layer_attr: lstm layer's extra attribute.
:type lstm_layer_attr: ExtraLayerAttribute
:param get_output_layer_attr: get output layer's extra attribute.
:type get_output_layer_attr: ExtraLayerAttribute
:return: lstmemory unit name.
:rtype: LayerOutput
"""
if size is None:
assert input.size % 4 == 0
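A hypothetical usage sketch (the argument names follow the docstring above and the legacy trainer_config_helpers conventions, but the exact call is not verified): lstmemory_unit defines a single recurrent step, so it is typically passed as the step function of recurrent_group, which is exactly how lstmemory_group uses it below.

.. code-block:: python

    # Hypothetical sketch; "projected_seq" stands for any LayerOutput whose
    # per-step width is 4 * size, as required by the assert above.
    def lstm_step(ipt):
        return lstmemory_unit(input=ipt, size=128)

    lstm_seq = recurrent_group(step=lstm_step, input=projected_seq)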
......@@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None,
@wrap_name_default('lstm_group')
def lstmemory_group(input, size=None, name=None,
reverse=False, param_attr=None,
mix_bias_attr=None, lstm_bias_attr=None,
act=None, gate_act=None, state_act=None,
mixed_bias_attr=None, lstm_bias_attr=None,
mixed_layer_attr=None, lstm_layer_attr=None,
get_output_layer_attr=None):
"""
TODO(yuyang18): complete docs
@param input:
@param size:
@param name:
@param reverse:
@param param_attr:
@param mix_bias_attr:
@param lstm_bias_attr:
@param act:
@param gate_act:
@param state_act:
@param mixed_layer_attr:
@param lstm_layer_attr:
@param get_output_layer_attr:
@return:
:param input: input layer name.
:type input: LayerOutput
:param name: lstmemory group name.
:type name: basestring
:param size: lstmemory group size.
:type size: int
:param reverse: is lstm reversed
:type reverse: bool
:param param_attr: Parameter config, None if use default.
:type param_attr: ParameterAttribute
:param act: lstm final activate type
:type act: BaseActivation
:param gate_act: lstm gate activate type
:type gate_act: BaseActivation
:param state_act: lstm state activate type.
:type state_act: BaseActivation
:param mixed_bias_attr: bias parameter attribute of mixed layer.
False means no bias, None means default bias.
:type mixed_bias_attr: ParameterAttribute|False
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False
:param mixed_layer_attr: mixed layer's extra attribute.
:type mixed_layer_attr: ExtraLayerAttribute
:param lstm_layer_attr: lstm layer's extra attribute.
:type lstm_layer_attr: ExtraLayerAttribute
:param get_output_layer_attr: get output layer's extra attribute.
:type get_output_layer_attr: ExtraLayerAttribute
:return: lstmemory group name.
:rtype: LayerOutput
"""
def __lstm_step__(ipt):
return lstmemory_unit(input=ipt, name=name,
size=size, mixed_bias_attr=mix_bias_attr,
size=size, mixed_bias_attr=mixed_bias_attr,
mixed_layer_attr=mixed_layer_attr,
param_attr=param_attr,
lstm_bias_attr=lstm_bias_attr,
......@@ -760,13 +790,14 @@ def simple_attention(encoded_sequence,
Size of the context vector equals to size of encoded_sequence.
.. math::
a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
.. math::
e_{i,j} = a(s_{i-1}, h_{j})
.. math::
a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
.. math::
c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
e_{i,j} & = a(s_{i-1}, h_{j})
a_{i,j} & = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
where :math:`h_{j}` is the jth element of encoded_sequence,
:math:`U_{a}h_{j}` is the jth element of encoded_proj
......@@ -778,6 +809,7 @@ def simple_attention(encoded_sequence,
https://arxiv.org/abs/1409.0473.
The example usage is:
.. code-block:: python
context = simple_attention(encoded_sequence=enc_seq,
......
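A compact NumPy sketch of the attention equations above, taking the scoring nonlinearity f to be tanh (an assumption made only for this sketch):

.. code-block:: python

    import numpy as np

    def attention_context(s_prev, H, W_a, U_a, v_a):
        """Context vector for one decoder step.
        s_prev : previous decoder state, shape (ds,)
        H      : encoded_sequence, one row per h_j, shape (T, eh)
        W_a    : (ds, d), U_a : (eh, d), v_a : (d,)"""
        e = np.tanh(s_prev @ W_a + H @ U_a) @ v_a   # scores e_{i,j}
        a = np.exp(e - e.max())
        a /= a.sum()                                # softmax weights a_{i,j}
        return a @ H                                # context c_i = sum_j a_{i,j} h_j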
......@@ -61,7 +61,7 @@ class BaseSGDOptimizer(Optimizer):
.. math::
w:= w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
where :math:`\\eta` is learning rate. And :math:`n` is batch size.
......@@ -99,9 +99,9 @@ class AdamOptimizer(BaseSGDOptimizer):
.. math::
m(w, t) &:= \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
v(w, t) &:= \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
w &:= w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
:param beta1: the :math:`\\beta_1` in equation.
:type beta1: float
......@@ -136,11 +136,12 @@ class AdamaxOptimizer(BaseSGDOptimizer):
For details, please refer to `Adam: A Method for Stochastic Optimization
<https://arxiv.org/abs/1412.6980>`_
.. math::
m_t &:= \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
u_t &:= max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
w_t &:= w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
:param beta1: the :math:`\\beta_1` in the equation.
:type beta1: float
......@@ -175,7 +176,7 @@ class AdaGradOptimizer(BaseSGDOptimizer):
.. math::
G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
w &:= w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
"""
def to_setting_kwargs(self):
......@@ -197,8 +198,8 @@ class RMSPropOptimizer(BaseSGDOptimizer):
.. math::
v(w, t) &:= \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
w &:= w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
:param rho: the :math:`\\rho` in the equation. The forgetting factor.
:type rho: float
......
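To illustrate the update rules, here is a minimal NumPy sketch of one RMSProp step taken directly from the two equations above (the hyper-parameter defaults are illustrative, not the library's):

.. code-block:: python

    import numpy as np

    def rmsprop_update(w, grad, v, rho=0.95, eta=0.01, eps=1e-6):
        """v = rho * v + (1 - rho) * grad**2;  w = w - eta / sqrt(v + eps) * grad."""
        v = rho * v + (1.0 - rho) * grad ** 2
        w = w - eta / np.sqrt(v + eps) * grad
        return w, v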