From af0bbfa2673a6ffdf996638e6fb5e0b4627eb8b0 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 1 Sep 2016 02:49:44 +0000
Subject: [PATCH] add comments for networks.py

ISSUE=4611081

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1475 1ad973e4-5ce8-4261-8a94-b56d1f490c56
---
 .../paddle/trainer_config_helpers/layers.py   |  96 ++++----
 .../paddle/trainer_config_helpers/networks.py | 214 +++++++++++++++---
 2 files changed, 243 insertions(+), 67 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 44e6a31364..f3f7c83f1d 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -743,7 +743,8 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None,
                            pooling_type=AvgPooling(),
                            agg_level=AggregateLevel.EACH_SEQUENCE)
 
-    :param agg_level: AggregateLevel.EACH_TIMESTEP or AggregateLevel.EACH_SEQUENCE
+    :param agg_level: AggregateLevel.EACH_TIMESTEP or
+                      AggregateLevel.EACH_SEQUENCE
     :type agg_level: AggregateLevel
     :param name: layer name.
    :type name: basestring
@@ -806,21 +807,24 @@ def lstmemory(input, name=None, reverse=False, act=None,
 
         h_t & = o_t tanh(c_t)
 
-    NOTE: In paddle's implementation, the multiply operation
+    NOTE: In PaddlePaddle's implementation, the multiplications
     :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` is not done by
-    lstmemory layer, so it must use a mixed_layer do this full_matrix_projection
-    before lstm is used.
+    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer,
+    so an additional mixed_layer with full_matrix_projection or an fc_layer must
+    be included in the configuration file to complete the input-to-hidden
+    mappings before lstmemory is called.
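+
+    A minimal sketch of a configuration that satisfies this requirement is
+    given below (``hidden_dim`` and ``embedding`` are illustrative names, not
+    part of this API):
+
+    .. code-block:: python
+
+        # lstmemory expects its input to already contain the four projections
+        # W_{xi}x_t, W_{xf}x_t, W_{xc}x_t and W_{xo}x_t, so the projected
+        # size must be 4 * hidden_dim.
+        with mixed_layer(size=hidden_dim * 4) as lstm_input:
+            lstm_input += full_matrix_projection(input=embedding)
+
+        lstm = lstmemory(input=lstm_input)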
 
-    NOTE: This is a low level user interface. You may use network.simple_lstm
+    NOTE: This is a low-level user interface. You can use network.simple_lstm
     to config a simple plain lstm layer.
 
-    Please refer **Generating Sequences With Recurrent Neural Networks** if you
-    want to know what lstm is. Link_ is here.
+    Please refer to **Generating Sequences With Recurrent Neural Networks** for
+    more details about LSTM.
+
+    The Link_ is as follows.
 
     .. _Link: http://arxiv.org/abs/1308.0850
 
-    TODO(yuyang18): Check lstm can input multiple values or not?
+    TODO(yuyang18): Check whether lstm can take multiple input values.
 
     :param name: The lstmemory layer name.
     :type name: basestring
@@ -894,28 +898,30 @@ def grumemory(input, name=None, reverse=False, act=None,
 
        r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r)
 
-    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to that
-       of the traditional recurrent unit:
+    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to
+       that of the traditional recurrent unit:
 
     .. math::
 
       {\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b)
 
-    4. The hidden activation :math:`h_t` of the GRU at time t is a linear interpolation
-       between the previous activation :math:`h_{t-1}` and the candidate activation
-       :math:`\\tilde{h_t}`:
+    4. The hidden activation :math:`h_t` of the GRU at time t is a linear
+       interpolation between the previous activation :math:`h_{t-1}` and the
+       candidate activation :math:`\\tilde{h_t}`:
 
     .. math::
 
      h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
 
-    NOTE: In paddle's implementation, the multiply operation
+    NOTE: In PaddlePaddle's implementation, the multiplication operations
     :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in
-    gate_recurrent layer. So it must use a mixed_layer with full_matrix_projection
-    or fc_layer to compute them before GRU.
+    gate_recurrent layer. Consequently, an additional mixed_layer with
+    full_matrix_projection or an fc_layer must be included before grumemory
+    is called.
 
-    The details can refer to `Empirical Evaluation of Gated Recurrent
-    Neural Networks on Sequence Modeling. <https://arxiv.org/abs/1412.3555>`_
+    More details can be found by referring to `Empirical Evaluation of Gated
+    Recurrent Neural Networks on Sequence Modeling.
+    <https://arxiv.org/abs/1412.3555>`_
 
     The simple usage is:
 
@@ -1279,7 +1285,8 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
 @wrap_name_default()
 @wrap_bias_attr_default(has_bias=True)
 @layer_support()
-def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=None):
+def hsigmoid(input, label, num_classes, name=None, bias_attr=None,
+             layer_attr=None):
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
@@ -1358,12 +1365,12 @@ def img_conv_layer(input, filter_size, num_filters,
     input is raw pixels of image(mono or RGB), or it may be the previous layer's
     num_filters * num_group.
 
-    There are several group of filter in paddle
-    implementation. Each group will process some channel of inputs. For example,
-    if input num_channel = 256, group = 4, num_filter=32, the paddle will create
+    There are several groups of filters in the PaddlePaddle implementation.
+    Each group will process some channels of the inputs. For example, if the
+    input num_channel = 256, group = 4, num_filter = 32, PaddlePaddle will create
     32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The rest
-    channels will be processed by rest group of filters.
+    pieces. The first 256/4 = 64 channels will be processed by the first 32
+    filters, and the rest will be processed by the remaining groups of filters.
 
     :param name: Layer name.
     :type name: basestring
@@ -1371,9 +1378,9 @@ def img_conv_layer(input, filter_size, num_filters,
     :type input: LayerOutput
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since paddle now
-                          support rectangular filters, the filter's shape
-                          will be (filter_size, filter_size_y).
+    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
+                          currently supports rectangular filters, the filter's
+                          shape will be (filter_size, filter_size_y).
     :type filter_size_y: int
     :param num_filters: Each filter group's number of filter
     :param act: Activation type. Default is tanh
@@ -1744,11 +1751,13 @@ def addto_layer(input, act=None, name=None, bias_attr=None,
     inputs. Each input of this layer should be the same size, which is also the
     output size of this layer.
 
-    There is no weight matrix for each input, because it just a simple add operation.
-    If you want to a complicated operation before add, please use mixed_layer.
+    There is no weight matrix for each input, because it is just a simple add
+    operation. If you want a complicated operation before the addition, please
+    use mixed_layer.
 
     It is a very good way to set dropout outside the layers. Since not all
-    paddle layer support dropout, you can add an add_to layer, set dropout here.
+    PaddlePaddle layers support dropout, you can add an add_to layer and set
+    dropout here.
     Please refer to dropout_layer for details.
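+
+    A minimal sketch of this usage is given below (the input names and the
+    drop rate are only illustrative):
+
+    .. code-block:: python
+
+        # sum the two inputs element-wise, then apply dropout to the result
+        dropped = addto_layer(input=[layer1, layer2],
+                              act=ReluActivation(),
+                              layer_attr=ExtraLayerAttribute(drop_rate=0.5))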
 
     :param name: Layer name.
@@ -2063,9 +2072,10 @@ def gru_step_layer(input, output_mem, size=None, act=None,
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
     """
-    Get layer's output by name. In paddle, a layer might return multiple value,
-    but return one layer output. If user want to reference another output beside
-    default output, use get_output_layer first to get another output from input.
+    Get layer's output by name. In PaddlePaddle, a layer might return multiple
+    values, but returns only one of them as its default output. If the user
+    wants to use another output besides the default one, please use
+    get_output_layer first to get the output from input.
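+
+    A sketch of such a usage is shown below, assuming the lstm layer exposes
+    an extra output named ``state`` (the layer names are illustrative):
+
+    .. code-block:: python
+
+        lstm = lstmemory(input=lstm_input)
+        # fetch the cell state instead of the default hidden output
+        cell_state = get_output_layer(input=lstm, arg_name='state')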
 
     :param name: Layer's name.
     :type name: basestring
@@ -2155,7 +2165,11 @@ class SubsequenceInput(object):
 @wrap_name_default("recurrent_group")
 def recurrent_group(step, input, reverse=False, name=None):
     """
-    Recurrent Group. It supports time steps and sequence steps mechanisms.
+    Recurrent layer group is an extremely flexible recurrent unit in
+    PaddlePaddle. As long as the user defines the calculation done within a
+    time step, PaddlePaddle will iterate such a recurrent calculation over
+    sequence input. This is extremely useful for attention-based models, or
+    Neural Turing Machine like models.
 
     The basic usage (time steps) is:
 
@@ -2603,9 +2617,9 @@ def conv_operator(input, filter_size, num_filters,
     :type input: LayerOutput|list|tuple
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since paddle now
-                          support rectangular filters, the filter's shape
-                          will be (filter_size, filter_size_y).
+    :param filter_size_y: The y dimension of a filter kernel. Since
+                          PaddlePaddle now supports rectangular filters,
+                          the filter's shape can be (filter_size, filter_size_y).
     :type filter_size_y: int
     :param num_filter: channel of output data.
     :type num_filter: int
@@ -3264,9 +3278,9 @@ def lambda_cost(input, score, NDCG_num=5, max_sort_size=-1, coeff=1.0):
                          If max_sort_size = -1, then for each list, the
                          algorithm will sort the entire list to get gradient.
                          In other cases, max_sort_size must be greater than or
-                          equal to NDCG_num. And if max_sort_size is greater than
-                          the size of a list, the algorithm will sort the entire
-                          list of get gradient.
+                          equal to NDCG_num. And if max_sort_size is greater
+                          than the size of a list, the algorithm will sort the
+                          entire list to get the gradient.
     :type max_sort_size: int
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1d0a1d52a9..94b5245aba 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -466,7 +466,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param reverse: is lstm reversed
+    :param reverse: whether to process the input data in a reverse order
     :type reverse: bool
     :param mat_param_attr: mixed layer's matrix projection parameter attribute.
     :type mat_param_attr: ParameterAttribute
@@ -475,11 +475,11 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     :type bias_param_attr: ParameterAttribute|False
     :param inner_param_attr: lstm cell parameter attribute.
     :type inner_param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
     :param mixed_layer_attr: mixed layer's extra attribute.
     :type mixed_layer_attr: ExtraLayerAttribute
@@ -503,12 +503,43 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input, name=None, size=None, param_attr=None,
-                   act=None, gate_act=None, state_act=None,
+                   act=None, gate_act=None, state_act=None,
                    mixed_bias_attr=None, lstm_bias_attr=None,
                    mixed_layer_attr=None,lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
+    Define the calculations that an LSTM unit performs in a single time step.
+    This function itself is not a recurrent layer, so it cannot be
+    directly applied to sequence input. This function is always used in
+    recurrent_group (see layers.py for more details) to implement an
+    attention mechanism.
+
+    Please refer to **Generating Sequences With Recurrent Neural Networks**
+    for more details about LSTM. The link is as follows:
+    .. _Link: https://arxiv.org/abs/1308.0850
+
+    .. math::
+
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+
+        h_t & = o_t tanh(c_t)
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = lstmemory_unit(input=[layer1],
+                                   size=256,
+                                   act=TanhActivation(),
+                                   gate_act=SigmoidActivation(),
+                                   state_act=TanhActivation())
+
 
     :param input: input layer name.
     :type input: LayerOutput
@@ -518,11 +549,11 @@ def lstmemory_unit(input, name=None, size=None, param_attr=None,
     :type size: int
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
     :param mixed_bias_attr: bias parameter attribute of mixed layer.
                             False means no bias, None means default bias.
     :type mixed_bias_attr: ParameterAttribute|False
@@ -579,7 +610,31 @@ def lstmemory_group(input, size=None, name=None,
                     mixed_layer_attr=None, lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
+    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    does exactly the same calculation as the lstmemory layer (see lstmemory in
+    layers.py for the maths). A promising benefit is that the LSTM memory
+    cell states, or hidden states, in every time step are accessible to the
+    user. This is especially useful in attention models. If you do not need
+    to access the internal states of the lstm, but merely use its outputs,
+    it is recommended to use lstmemory, which is relatively faster than
+    lstmemory_group.
+
+    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
+    multiplications:
+    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
+    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    speed up the calculations. Consequently, an additional mixed_layer with
+    full_matrix_projection must be included before lstmemory_unit is called.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = lstmemory_group(input=[layer1],
+                                    size=256,
+                                    act=TanhActivation(),
+                                    gate_act=SigmoidActivation(),
+                                    state_act=TanhActivation())
 
     :param input: input layer name.
     :type input: LayerOutput
@@ -591,13 +646,13 @@ def lstmemory_group(input, size=None, name=None,
     :type reverse: bool
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
                             False means no bias, None means default bias.
     :type mixed_bias_attr: ParameterAttribute|False
     :param lstm_bias_attr: bias parameter attribute of lstm layer.
@@ -609,7 +664,7 @@ def lstmemory_group(input, size=None, name=None,
     :type lstm_layer_attr: ExtraLayerAttribute
     :param get_output_layer_attr: get output layer's extra attribute.
     :type get_output_layer_attr: ExtraLayerAttribute
-    :return: lstmemory group name.
+    :return: the lstmemory group.
     :rtype: LayerOutput
     """
 
@@ -639,16 +694,28 @@ def gru_unit(input,
              gate_act=None,
              gru_layer_attr=None):
     """
+    Define the calculations that a gated recurrent unit performs in a single
+    time step. This function itself is not a recurrent layer, so it cannot be
+    directly applied to sequence input. This function is almost always used in
+    the recurrent_group (see layers.py for more details) to implement an
+    attention mechanism.
 
-    :param input:
+    Please see grumemory in layers.py for the details about the maths.
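+
+    A sketch of how gru_unit is typically wrapped by a recurrent_group is
+    given below (all names and sizes are illustrative, and the input
+    projection follows the same pattern as simple_gru):
+
+    .. code-block:: python
+
+        # gru_unit expects its input to already be of size 3 * hidden_dim
+        with mixed_layer(size=hidden_dim * 3) as gru_input:
+            gru_input += full_matrix_projection(input=embedding)
+
+        def step(y):
+            return gru_unit(input=y, size=hidden_dim)
+
+        rnn = recurrent_group(step=step, input=gru_input)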
+
+    :param input: input layer name.
     :type input: LayerOutput
-    :param name:
-    :param size:
-    :param gru_bias_attr:
-    :param act:
-    :param gate_act:
-    :param gru_layer_attr:
-    :return:
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru output layer.
+    :rtype: LayerOutput
     """
 
     assert input.size % 3 == 0
@@ -678,6 +745,46 @@ def gru_group(input,
               gru_bias_attr=None,
               act=None, gate_act=None,
               gru_layer_attr=None):
+
+    """
+    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    does exactly the same calculation as the grumemory layer does. A promising
+    benefit is that the gru hidden states are accessible to the user. This is
+    especially useful in attention models. If you do not need to access
+    any internal state, but merely use the outputs of a GRU, it is
+    recommended to use the grumemory, which is relatively faster.
+
+    Please see grumemory in layers.py for more details about the maths.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gru = gru_group(input=[layer1],
+                        size=256,
+                        act=TanhActivation(),
+                        gate_act=SigmoidActivation())
+
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param reverse: whether to process the input data in a reverse order
+    :type reverse: bool
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_bias_attr: bias. False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru group.
+    :rtype: LayerOutput
+    """
+
     def __gru_step__(ipt):
         return gru_unit(
             input=ipt,
@@ -708,6 +815,43 @@ def simple_gru(input,
                gate_act=None,
                gru_layer_attr=None
                ):
+    """
+    simple_gru is also a recurrent layer group version of Gated Recurrent
+    Unit, just like gru_group. The difference only lies in implementation
+    details. In terms of computational speed, grumemory is relatively faster
+    than gru_group, and gru_group is relatively faster than simple_gru.
+
+    simple_gru does exactly the same calculation as the grumemory layer does.
+    Please see grumemory in layers.py for more details about the maths.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gru = simple_gru(input=[layer1],
+                         size=256,
+                         act=TanhActivation(),
+                         gate_act=SigmoidActivation())
+
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param reverse: whether to process the input data in a reverse order
+    :type reverse: bool
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_bias_attr: bias. False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru group.
+    :rtype: LayerOutput
+    """
     with mixed_layer(name='%s_transform' % name,
                      size=size * 3,
                      bias_attr=mixed_bias_param_attr,
@@ -739,7 +883,22 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
                        last_seq_attr=None, first_seq_attr=None,
                        concat_attr=None, concat_act=None):
     """
-    TODO(yuyang18): Complete docs
+    A bidirectional_lstm is a recurrent unit that iterates over the input
+    sequence both in forward and backward orders, and then concatenates the
+    two outputs to form a final output. However, concatenation of the two
+    outputs is not the only way to form the final output; you can also, for
+    example, just add them together.
+
+    Please refer to **Neural Machine Translation by Jointly Learning to Align
+    and Translate** for more details about the bidirectional lstm.
+    The link is as follows:
+    .. _Link: https://arxiv.org/pdf/1409.0473v3.pdf
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = bidirectional_lstm(input=[input1], size=512)
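+
+    If the two directional outputs should be combined by addition rather than
+    concatenation, a configuration along the following lines could be written
+    by hand (a sketch only, not what bidirectional_lstm itself returns; the
+    names are illustrative):
+
+    .. code-block:: python
+
+        fwd = simple_lstm(input=input1, size=512, reverse=False)
+        bwd = simple_lstm(input=input1, size=512, reverse=True)
+        # element-wise sum of the forward and backward sequences
+        merged = addto_layer(input=[fwd, bwd])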
 
     :param name: bidirectional lstm layer name.
     :type name: basestring
@@ -747,8 +906,11 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param return_seq: If False, concat word in last time step and return.
-                       If True, concat sequnce in all time step and return.
+    :param return_seq: If set to False, the outputs of the last time step
+                       are concatenated and returned.
+                       If set to True, the entire output sequences that are
+                       processed in forward and backward directions are
+                       concatenated and returned.
     :type return_seq: bool
     :return: lstm layer name.
     :rtype: LayerOutput
-- 
GitLab