Commit 260c734c authored by luotao02

fix bug in trainer_config_helpers

ISSUE=4592807

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1423 1ad973e4-5ce8-4261-8a94-b56d1f490c56

Parent 9c0895ee
@@ -51,7 +51,7 @@ SequenceSoftmaxActivation
 =========================
 .. automodule:: paddle.trainer_config_helpers.activations
-    :members: SequenceSoftmax
+    :members: SequenceSoftmaxActivation
     :noindex:

 ReluActivation
......
@@ -136,6 +136,18 @@ gru_step_layer
 Recurrent Layer Group
 =====================
+recurrent_group
+---------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: recurrent_group
+    :noindex:
+
+beam_search
+------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: beam_search
+    :noindex:
+
 get_output_layer
 -----------------
 .. automodule:: paddle.trainer_config_helpers.layers
......
@@ -43,34 +43,52 @@ vgg_16_network
 Recurrent
 =========
+LSTM
+----
+
 lstmemory_unit
---------------
+``````````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: lstmemory_unit
     :noindex:

 lstmemory_group
----------------
+```````````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: lstmemory_group
     :noindex:

+simple_lstm
+```````````
+.. automodule:: paddle.trainer_config_helpers.networks
+    :members: simple_lstm
+    :noindex:
+
+bidirectional_lstm
+``````````````````
+.. automodule:: paddle.trainer_config_helpers.networks
+    :members: bidirectional_lstm
+    :noindex:
+
+GRU
+---
+
 gru_unit
----------
+````````
 .. automodule:: paddle.trainer_config_helpers.networks
     :members: gru_unit
     :noindex:

-simple_lstm
------------
+gru_group
+`````````
 .. automodule:: paddle.trainer_config_helpers.networks
-    :members: simple_lstm
+    :members: gru_group
     :noindex:

-bidirectional_lstm
-------------------
+simple_gru
+``````````
 .. automodule:: paddle.trainer_config_helpers.networks
-    :members: bidirectional_lstm
+    :members: simple_gru
     :noindex:

 simple_attention
......
@@ -10,10 +10,10 @@ AdamOptimizer
     :members: AdamOptimizer
     :noindex:

-AdamxOptimizer
+AdamaxOptimizer
 ================
 .. automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdamxOptimizer
+    :members: AdamaxOptimizer
     :noindex:

 AdaGradOptimizer
......
@@ -28,9 +28,8 @@ ld-linux-x86-64.so.2
 x86_64-scm-linux-gnu/
 .lint.*.md5
-examples/crf/*.bin
 .idea/
+.test_env
 Paddle_wrap.cxx
 Paddle_wrap.h
 paddle.py
......
@@ -97,13 +97,13 @@ protected:
   * @param starts Each start position of each samples.
   * @param inputValue The input values.
   */
-  void forwardSequence(int batchSize, size_t numSequences,
-                       const int *starts, MatrixPtr inputValue);
+  void forwardSequence(int batchSize, size_t numSequences, const int *starts,
+                       MatrixPtr inputValue);
   /**
   * Compute lstm backward one sequence by one sequence.
   */
-  void backwardSequence(int batchSize, size_t numSequences,
-                        const int *starts, MatrixPtr inputGrad);
+  void backwardSequence(int batchSize, size_t numSequences, const int *starts,
+                        MatrixPtr inputGrad);
   /**
   * Compute lstm forward one batch by one batch. The batch value is
@@ -121,21 +121,21 @@
   * }
   * @endcode
   */
-  void forwardBatch(int batchSize, size_t numSequences,
-                    const int *starts, MatrixPtr inputValue);
+  void forwardBatch(int batchSize, size_t numSequences, const int *starts,
+                    MatrixPtr inputValue);
   /**
   * Compute lstm backward one batch by one batch.
   */
-  void backwardBatch(int batchSize, size_t numSequences,
-                     const int *starts, MatrixPtr inputGrad);
+  void backwardBatch(int batchSize, size_t numSequences, const int *starts,
+                     MatrixPtr inputGrad);
   /**
   * This function only supports GPU. It not need to reorganize input into
   * batch value. It will launch one kernel to parallelly compute forward
   * propagation in sequence level.
   */
-  void forwardSeqParallel(int batchSize, size_t numSequences,
-                          const int *starts, MatrixPtr inputValue);
+  void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts,
+                          MatrixPtr inputValue);
   /**
   * Backward propagation corresponding to forwardSeqParallel.
   */
@@ -157,7 +157,8 @@ protected:
   /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
   std::unique_ptr<Weight> weight_;
   /// Learned bias parameter, shape: (1, 7 * size).
-  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
+  /// W_{co}\f$.
   std::unique_ptr<Weight> bias_;
   /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
   MatrixPtr localBias_;
......
@@ -669,7 +669,7 @@ def fc_layer(input, size, act=None, name=None,
                        act=LinearActivation(),
                        bias_attr=False)

     which is equal to:

     .. code-block:: python
@@ -795,15 +795,15 @@ def lstmemory(input, name=None, reverse=False, act=None,
     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

     NOTE: In paddle's implementation, the multiply operation
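As a reading aid for this hunk (and the identical equation block in lstm_step_layer further down), here is a minimal NumPy sketch of one step of the peephole LSTM that the docstring describes. It is an illustration only, not PaddlePaddle's implementation, and every name in it is hypothetical:

```python
# Minimal NumPy sketch of the docstring's gate equations (illustration only,
# not PaddlePaddle code; all names here are hypothetical).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W_x, W_h, w_c, b):
    """One step; W_x/W_h/b hold the i, f, c, o blocks, w_c the peephole vectors."""
    pre = {g: x_t @ W_x[g] + h_prev @ W_h[g] + b[g] for g in "ifco"}
    i_t = sigmoid(pre["i"] + w_c["i"] * c_prev)      # input gate (peephole on c_{t-1})
    f_t = sigmoid(pre["f"] + w_c["f"] * c_prev)      # forget gate (peephole on c_{t-1})
    c_t = f_t * c_prev + i_t * np.tanh(pre["c"])     # cell state
    o_t = sigmoid(pre["o"] + w_c["o"] * c_t)         # output gate (peephole on c_t)
    h_t = o_t * np.tanh(c_t)                         # hidden output
    return h_t, c_t
```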
@@ -1294,8 +1294,6 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
                      label=data_layer,
                      num_classes=3)

-    :param name: layer name
-    :type name: basestring
     :param input: Input layers. It could be a LayerOutput or list/tuple of
                   LayerOutput.
     :type input: LayerOutput|list|tuple
@@ -1303,6 +1301,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No
     :type label: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
+    :param name: layer name
+    :type name: basestring
     :param bias_attr: Bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
@@ -1943,18 +1943,18 @@ def lstm_step_layer(input, state, size, act=None,
     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

-    The input\_ of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
+    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
     :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
     input vector.
@@ -2347,12 +2347,12 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
        eos = eos_layer(input=layer, eos_id=id)

-    :param name: Layer name.
-    :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
     :param eos_id: end id of sequence
     :type eos_id: int
+    :param name: Layer name.
+    :type name: basestring
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: layer name.
@@ -2529,11 +2529,11 @@ def conv_operator(input, filter_size, num_filters,
     :param num_filter: channel of output data.
     :type num_filter: int
     :param num_channel: channel of input data.
-    :rtype num_channel: int
+    :type num_channel: int
     :param stride: The x dimension of the stride.
-    :rtype stride: int
+    :type stride: int
     :param stride_y: The y dimension of the stride.
-    :rtype stride_y: int
+    :type stride_y: int
     :param padding: The x dimension of padding.
     :type padding: int
     :param padding_y: The y dimension of padding.
@@ -2632,7 +2632,7 @@ def tensor_layer(input, size, act=None, name=None,
     :param input: Input layer.
     :type input: LayerOutput|list|tuple.
     :param size: the layer dimension.
-    :rtype: int.
+    :type size: int.
     :param act: Activation Type. Default is tanh.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
@@ -2840,7 +2840,7 @@ def convex_comb_layer(input, size, name=None):
     """
     A layer for convex weighted average of vectors takes two inputs.
       - Input: a vector containing the convex weights (batchSize x weightdim),
-        and a matrix in a vector form (batchSize x (weightdim*datadim)).
+        and a matrix in a vector form (batchSize x (weightdim * datadim)).
       - Output: a vector (batchSize * datadim).

     .. math::
@@ -2893,8 +2893,8 @@ def block_expand_layer(input,
                        name=None):
     """
     Expand feature map to minibatch matrix.
       - matrix width is: block_y * block_x * channel
       - matirx height is: outputH * outputW

     .. math::
@@ -3100,11 +3100,11 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
     .. math::

-       C_{i,j} = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
-       o_{i,j} = o_i - o_j
-       \\tilde{P_{i,j}} = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
+       C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
+       o_{i,j} & = o_i - o_j
+       \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}

     In this formula:
       - :math:`C_{i,j}` is the cross entropy cost.
......
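The rank_cost equations retypeset in the hunk above are simple enough to sanity-check numerically; the snippet below is only such a check (hypothetical names, not Paddle code):

```python
# Numerical check of the rank_cost formula above (hypothetical names, not
# Paddle code): C = -label * (o_i - o_j) + log(1 + exp(o_i - o_j)).
import numpy as np

def rank_cost_value(o_left, o_right, label):
    o_ij = o_left - o_right          # o_{i,j} = o_i - o_j
    return -label * o_ij + np.log1p(np.exp(o_ij))

print(rank_cost_value(2.0, 1.0, 1.0))   # left ranked higher as labelled: small cost
print(rank_cost_value(1.0, 2.0, 1.0))   # wrong order: larger cost
```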
@@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     """
     Simple LSTM Cell.

-    It just combine a mix_layer with fully_matrix_projection and a lstmemory
+    It just combine a mixed layer with fully_matrix_projection and a lstmemory
     layer. The simple lstm cell was implemented as follow equations.

     .. math::

-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-        h_t = o_t tanh(c_t)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        h_t & = o_t tanh(c_t)

     Please refer **Generating Sequences With Recurrent Neural Networks** if you
     want to know what lstm is. Link_ is here.
@@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 @wrap_name_default('lstm_unit')
-def lstmemory_unit(input, name=None, size=None,
-                   mixed_bias_attr=None, mixed_layer_attr=None,
-                   param_attr=None, lstm_bias_attr=None,
-                   act=None, gate_act=None,
-                   state_act=None, lstm_layer_attr=None,
+def lstmemory_unit(input, name=None, size=None, param_attr=None,
+                   act=None, gate_act=None, state_act=None,
+                   mixed_bias_attr=None, lstm_bias_attr=None,
+                   mixed_layer_attr=None,lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
     TODO(yuyang18): complete docs

-    @param input:
-    @param name:
-    @param size:
-    @param mixed_bias_attr:
-    @param mixed_layer_attr:
-    @param param_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: lstmemory unit name.
+    :type name: basestring
+    :param size: lstmemory unit size.
+    :type size: int
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activate type
+    :type act: BaseActivation
+    :param gate_act: lstm gate activate type
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activate type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory unit name.
+    :rtype: LayerOutput
     """
     if size is None:
         assert input.size % 4 == 0
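Since the docstring above is still marked TODO, a call sketch may help; it is assembled purely from the parameter list documented in this hunk (mirroring the __lstm_step__ helper in the next hunk). The data layer, sizes and activations below are hypothetical, not taken from the Paddle source or its examples:

```python
# Hypothetical call sketch based only on the parameters documented above.
from paddle.trainer_config_helpers import *

ipt = data_layer(name='lstm_input', size=1024)      # assumed to be 4 * lstm size
out = lstmemory_unit(input=ipt,
                     name='lstm_unit',
                     size=256,
                     act=TanhActivation(),
                     gate_act=SigmoidActivation(),
                     state_act=TanhActivation(),
                     mixed_bias_attr=False,         # no bias in the mixed layer
                     lstm_bias_attr=None)           # default bias in the lstm layer
```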
@@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None,
 @wrap_name_default('lstm_group')
 def lstmemory_group(input, size=None, name=None,
                     reverse=False, param_attr=None,
-                    mix_bias_attr=None, lstm_bias_attr=None,
                     act=None, gate_act=None, state_act=None,
+                    mixed_bias_attr=None, lstm_bias_attr=None,
                     mixed_layer_attr=None, lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
     TODO(yuyang18): complete docs

-    @param input:
-    @param size:
-    @param name:
-    @param reverse:
-    @param param_attr:
-    @param mix_bias_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param mixed_layer_attr:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: lstmemory group name.
+    :type name: basestring
+    :param size: lstmemory group size.
+    :type size: int
+    :param reverse: is lstm reversed
+    :type reverse: bool
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activate type
+    :type act: BaseActivation
+    :param gate_act: lstm gate activate type
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activate type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory group name.
+    :rtype: LayerOutput
     """
     def __lstm_step__(ipt):
         return lstmemory_unit(input=ipt, name=name,
-                              size=size, mixed_bias_attr=mix_bias_attr,
+                              size=size, mixed_bias_attr=mixed_bias_attr,
                               mixed_layer_attr=mixed_layer_attr,
                              param_attr=param_attr,
                              lstm_bias_attr=lstm_bias_attr,
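This hunk carries the rename from the commit message: the keyword mix_bias_attr becomes mixed_bias_attr, so lstmemory_group and lstmemory_unit expose a consistent argument name. A hedged usage sketch follows; the layer names and sizes are hypothetical and not taken from Paddle's examples:

```python
# Hypothetical usage sketch built from the documented parameters.
from paddle.trainer_config_helpers import *

seq = data_layer(name='seq_input', size=1024)       # assumed to be 4 * lstm size
lstm_seq = lstmemory_group(input=seq,
                           size=256,
                           reverse=False,
                           gate_act=SigmoidActivation(),
                           state_act=TanhActivation(),
                           mixed_bias_attr=False)   # the renamed keyword
```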
@@ -760,13 +790,14 @@ def simple_attention(encoded_sequence,
     Size of the context vector equals to size of encoded_sequence.

     .. math::
-        a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
-    .. math::
-        e_{i,j} = a(s_{i-1}, h_{j})
-    .. math::
-        a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
-    .. math::
-        c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
+
+        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}

     where :math:`h_{j}` is the jth element of encoded_sequence,
     :math:`U_{a}h_{j}` is the jth element of encoded_proj
@@ -778,6 +809,7 @@ def simple_attention(encoded_sequence,
     https://arxiv.org/abs/1409.0473.

     The example usage is:
+
     .. code-block:: python

         context = simple_attention(encoded_sequence=enc_seq,
......
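The four attention equations retypeset above reduce to a few lines of linear algebra; the following NumPy sketch is a reading aid only (hypothetical names, not the simple_attention implementation):

```python
# NumPy sketch of the attention equations in the docstring above (reading aid
# only; hypothetical names, not the simple_attention implementation).
import numpy as np

def attention_context(s_prev, H, W_a, U_a, v_a):
    """s_prev: (d,) decoder state; H: (T, h) encoder states h_j."""
    scores = np.tanh(s_prev @ W_a + H @ U_a) @ v_a     # e_{i,j} = v_a f(W_a s_{i-1} + U_a h_j)
    weights = np.exp(scores) / np.sum(np.exp(scores))  # a_{i,j}: softmax over j
    return weights @ H                                 # c_i = sum_j a_{i,j} h_j
```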
@@ -61,7 +61,7 @@ class BaseSGDOptimizer(Optimizer):
     .. math::

-        w:= w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)

     where :math:`\\eta` is learning rate. And :math:`n` is batch size.
@@ -99,9 +99,9 @@ class AdamOptimizer(BaseSGDOptimizer):
     .. math::

-        m(w, t) &:= \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) &:= \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w &:= w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
+        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}

     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
@@ -136,11 +136,12 @@ class AdamaxOptimizer(BaseSGDOptimizer):
     The details of please refer this `Adam: A Method for Stochastic Optimization
     <https://arxiv.org/abs/1412.6980>`_

     .. math::

-        m_t &:= \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t &:= max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t &:= w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t

     :param beta1: the :math:`\\beta_1` in the equation.
     :type beta1: float
@@ -175,7 +176,7 @@ class AdaGradOptimizer(BaseSGDOptimizer):
     .. math::

         G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w &:= w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
     """

     def to_setting_kwargs(self):
@@ -197,8 +198,8 @@ class RMSPropOptimizer(BaseSGDOptimizer):
     .. math::

-        v(w, t) &:= \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w &:= w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)

     :param rho: the :math:`\\rho` in the equation. The forgetting factor.
     :type rho: float
......
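The update rules retypeset above map directly onto code; the following NumPy sketch restates the RMSProp and Adamax rules exactly as written (illustration only; hypothetical names, not trainer_config_helpers code):

```python
# NumPy restatement of the RMSProp and Adamax update rules shown above
# (illustration only; hypothetical names, not trainer_config_helpers code).
import numpy as np

def rmsprop_update(w, grad, v, eta=1e-3, rho=0.95, eps=1e-6):
    v = rho * v + (1.0 - rho) * grad ** 2        # v(w, t)
    w = w - eta / np.sqrt(v + eps) * grad        # w update
    return w, v

def adamax_update(w, grad, m, u, t, eta=1e-3, beta1=0.9, beta2=0.999):
    m = beta1 * m + (1.0 - beta1) * grad         # m_t
    u = np.maximum(beta2 * u, np.abs(grad))      # u_t
    w = w - (eta / (1.0 - beta1 ** t)) * m / u   # w_t
    return w, m, u
```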