From af0bbfa2673a6ffdf996638e6fb5e0b4627eb8b0 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 1 Sep 2016 02:49:44 +0000
Subject: [PATCH] add comments for networks.py

ISSUE=4611081

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1475 1ad973e4-5ce8-4261-8a94-b56d1f490c56
---
 .../paddle/trainer_config_helpers/layers.py   |  96 ++++----
 .../paddle/trainer_config_helpers/networks.py | 214 +++++++++++++++---
 2 files changed, 243 insertions(+), 67 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 44e6a31364..f3f7c83f1d 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -743,7 +743,8 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None,
                            pooling_type=AvgPooling(),
                            agg_level=AggregateLevel.EACH_SEQUENCE)
 
-    :param agg_level: AggregateLevel.EACH_TIMESTEP or AggregateLevel.EACH_SEQUENCE
+    :param agg_level: AggregateLevel.EACH_TIMESTEP or
+                      AggregateLevel.EACH_SEQUENCE
     :type agg_level: AggregateLevel
     :param name: layer name.
    :type name: basestring
@@ -806,21 +807,24 @@ def lstmemory(input, name=None, reverse=False, act=None,
 
         h_t & = o_t tanh(c_t)
 
-    NOTE: In paddle's implementation, the multiply operation
+    NOTE: In PaddlePaddle's implementation, the multiplications
     :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` is not done by
-    lstmemory layer, so it must use a mixed_layer do this full_matrix_projection
-    before lstm is used.
+    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer,
+    so an additional mixed_layer with full_matrix_projection or an fc_layer must
+    be included in the configuration file to complete the input-to-hidden
+    mappings before lstmemory is called.
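+
+    A minimal sketch of a configuration that satisfies this requirement is
+    given below (``hidden_dim`` and ``embedding`` are illustrative names, not
+    part of this API):
+
+    .. code-block:: python
+
+        # lstmemory expects its input to already contain the four projections
+        # W_{xi}x_t, W_{xf}x_t, W_{xc}x_t and W_{xo}x_t, so the projected
+        # size must be 4 * hidden_dim.
+        with mixed_layer(size=hidden_dim * 4) as lstm_input:
+            lstm_input += full_matrix_projection(input=embedding)
+
+        lstm = lstmemory(input=lstm_input)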
 
-    NOTE: This is a low level user interface. You may use network.simple_lstm
+    NOTE: This is a low-level user interface. You can use network.simple_lstm
     to config a simple plain lstm layer.
 
-    Please refer **Generating Sequences With Recurrent Neural Networks** if you
-    want to know what lstm is. Link_ is here.
+    Please refer to **Generating Sequences With Recurrent Neural Networks** for
+    more details about LSTM.
+
+    The Link_ is as follows.
 
     .. _Link: http://arxiv.org/abs/1308.0850
 
-    TODO(yuyang18): Check lstm can input multiple values or not?
+    TODO(yuyang18): Check whether lstm can take multiple input values.
 
     :param name: The lstmemory layer name.
     :type name: basestring
@@ -894,28 +898,30 @@ def grumemory(input, name=None, reverse=False, act=None,
 
        r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r)
 
-    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to that
-       of the traditional recurrent unit:
+    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to
+       that of the traditional recurrent unit:
 
     .. math::
 
       {\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b)
 
-    4. The hidden activation :math:`h_t` of the GRU at time t is a linear interpolation
-       between the previous activation :math:`h_{t-1}` and the candidate activation
-       :math:`\\tilde{h_t}`:
+    4. The hidden activation :math:`h_t` of the GRU at time t is a linear
+       interpolation between the previous activation :math:`h_{t-1}` and the
+       candidate activation :math:`\\tilde{h_t}`:
 
     .. math::
 
      h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
 
-    NOTE: In paddle's implementation, the multiply operation
+    NOTE: In PaddlePaddle's implementation, the multiplication operations
     :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in
-    gate_recurrent layer. So it must use a mixed_layer with full_matrix_projection
-    or fc_layer to compute them before GRU.
+    gate_recurrent layer. Consequently, an additional mixed_layer with
+    full_matrix_projection or an fc_layer must be included before grumemory
+    is called.
 
-    The details can refer to `Empirical Evaluation of Gated Recurrent
-    Neural Networks on Sequence Modeling. <https://arxiv.org/abs/1412.3555>`_
+    More details can be found by referring to `Empirical Evaluation of Gated
+    Recurrent Neural Networks on Sequence Modeling.
+    <https://arxiv.org/abs/1412.3555>`_
 
     The simple usage is:
 
@@ -1279,7 +1285,8 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
 @wrap_name_default()
 @wrap_bias_attr_default(has_bias=True)
 @layer_support()
-def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=None):
+def hsigmoid(input, label, num_classes, name=None, bias_attr=None,
+             layer_attr=None):
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
@@ -1358,12 +1365,12 @@ def img_conv_layer(input, filter_size, num_filters,
     input is raw pixels of image(mono or RGB), or it may be the previous layer's
     num_filters * num_group.
 
-    There are several group of filter in paddle
-    implementation. Each group will process some channel of inputs. For example,
-    if input num_channel = 256, group = 4, num_filter=32, the paddle will create
+    There are several groups of filters in the PaddlePaddle implementation.
+    Each group will process some channels of the inputs. For example, if the
+    input num_channel = 256, group = 4, num_filter = 32, PaddlePaddle will create
     32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The rest
-    channels will be processed by rest group of filters.
+    pieces. The first 256/4 = 64 channels will be processed by the first 32
+    filters, and the rest will be processed by the remaining groups of filters.
 
     :param name: Layer name.
     :type name: basestring
@@ -1371,9 +1378,9 @@ def img_conv_layer(input, filter_size, num_filters,
     :type input: LayerOutput
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since paddle now
-                          support rectangular filters, the filter's shape
-                          will be (filter_size, filter_size_y).
+    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
+                          currently supports rectangular filters, the filter's
+                          shape will be (filter_size, filter_size_y).
     :type filter_size_y: int
     :param num_filters: Each filter group's number of filter
     :param act: Activation type. Default is tanh
@@ -1744,11 +1751,13 @@ def addto_layer(input, act=None, name=None, bias_attr=None,
     inputs. Each input of this layer should be the same size, which is also the
     output size of this layer.
 
-    There is no weight matrix for each input, because it just a simple add operation.
-    If you want to a complicated operation before add, please use mixed_layer.
+    There is no weight matrix for each input, because it is just a simple add
+    operation. If you want a complicated operation before the addition, please
+    use mixed_layer.
 
     It is a very good way to set dropout outside the layers. Since not all
-    paddle layer support dropout, you can add an add_to layer, set dropout here.
+    PaddlePaddle layers support dropout, you can add an add_to layer and set
+    dropout here.
     Please refer to dropout_layer for details.
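+
+    A minimal sketch of this usage is given below (the input names and the
+    drop rate are only illustrative):
+
+    .. code-block:: python
+
+        # sum the two inputs element-wise, then apply dropout to the result
+        dropped = addto_layer(input=[layer1, layer2],
+                              act=ReluActivation(),
+                              layer_attr=ExtraLayerAttribute(drop_rate=0.5))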
 
     :param name: Layer name.
@@ -2063,9 +2072,10 @@ def gru_step_layer(input, output_mem, size=None, act=None,
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
     """
-    Get layer's output by name. In paddle, a layer might return multiple value,
-    but return one layer output. If user want to reference another output beside
-    default output, use get_output_layer first to get another output from input.
+    Get layer's output by name. In PaddlePaddle, a layer might return multiple
+    values, but returns only one of them as its default output. If the user
+    wants to use another output besides the default one, please use
+    get_output_layer first to get the output from input.
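+
+    A sketch of such a usage is shown below, assuming the lstm layer exposes
+    an extra output named ``state`` (the layer names are illustrative):
+
+    .. code-block:: python
+
+        lstm = lstmemory(input=lstm_input)
+        # fetch the cell state instead of the default hidden output
+        cell_state = get_output_layer(input=lstm, arg_name='state')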
 
     :param name: Layer's name.
     :type name: basestring
@@ -2155,7 +2165,11 @@ class SubsequenceInput(object):
 @wrap_name_default("recurrent_group")
 def recurrent_group(step, input, reverse=False, name=None):
     """
-    Recurrent Group. It supports time steps and sequence steps mechanisms.
+    Recurrent layer group is an extremely flexible recurrent unit in
+    PaddlePaddle. As long as the user defines the calculation done within a
+    time step, PaddlePaddle will iterate such a recurrent calculation over
+    sequence input. This is extremely useful for attention-based models, or
+    Neural Turing Machine like models.
 
     The basic usage (time steps) is:
 
@@ -2603,9 +2617,9 @@ def conv_operator(input, filter_size, num_filters,
     :type input: LayerOutput|list|tuple
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since paddle now
-                          support rectangular filters, the filter's shape
-                          will be (filter_size, filter_size_y).
+    :param filter_size_y: The y dimension of a filter kernel. Since
+                          PaddlePaddle now supports rectangular filters,
+                          the filter's shape can be (filter_size, filter_size_y).
     :type filter_size_y: int
     :param num_filter: channel of output data.
     :type num_filter: int
@@ -3264,9 +3278,9 @@ def lambda_cost(input, score, NDCG_num=5, max_sort_size=-1, coeff=1.0):
                          If max_sort_size = -1, then for each list, the
                          algorithm will sort the entire list to get gradient.
                          In other cases, max_sort_size must be greater than or
-                          equal to NDCG_num. And if max_sort_size is greater than
-                          the size of a list, the algorithm will sort the entire
-                          list of get gradient.
+                          equal to NDCG_num. And if max_sort_size is greater
+                          than the size of a list, the algorithm will sort the
+                          entire list to get the gradient.
     :type max_sort_size: int
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1d0a1d52a9..94b5245aba 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -466,7 +466,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param reverse: is lstm reversed
+    :param reverse: whether to process the input data in a reverse order
     :type reverse: bool
     :param mat_param_attr: mixed layer's matrix projection parameter attribute.
     :type mat_param_attr: ParameterAttribute
@@ -475,11 +475,11 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     :type bias_param_attr: ParameterAttribute|False
     :param inner_param_attr: lstm cell parameter attribute.
     :type inner_param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
     :param mixed_layer_attr: mixed layer's extra attribute.
     :type mixed_layer_attr: ExtraLayerAttribute
@@ -503,12 +503,43 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input, name=None, size=None, param_attr=None,
-                   act=None, gate_act=None, state_act=None,
+                   act=None, gate_act=None, state_act=None,
                    mixed_bias_attr=None, lstm_bias_attr=None,
                    mixed_layer_attr=None,lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
+    Define the calculations that an LSTM unit performs in a single time step.
+    This function itself is not a recurrent layer, so it cannot be
+    directly applied to sequence input. This function is always used in
+    recurrent_group (see layers.py for more details) to implement an
+    attention mechanism.
+
+    Please refer to **Generating Sequences With Recurrent Neural Networks**
+    for more details about LSTM. The link is as follows:
+    .. _Link: https://arxiv.org/abs/1308.0850
+
+    .. math::
+
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+
+        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+
+        h_t & = o_t tanh(c_t)
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = lstmemory_unit(input=[layer1],
+                                   size=256,
+                                   act=TanhActivation(),
+                                   gate_act=SigmoidActivation(),
+                                   state_act=TanhActivation())
+
 
     :param input: input layer name.
     :type input: LayerOutput
@@ -518,11 +549,11 @@ def lstmemory_unit(input, name=None, size=None, param_attr=None,
     :type size: int
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
     :param mixed_bias_attr: bias parameter attribute of mixed layer.
                             False means no bias, None means default bias.
     :type mixed_bias_attr: ParameterAttribute|False
@@ -579,7 +610,31 @@ def lstmemory_group(input, size=None, name=None,
                     mixed_layer_attr=None, lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
+    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    does exactly the same calculation as the lstmemory layer (see lstmemory in
+    layers.py for the maths). A promising benefit is that the LSTM memory
+    cell states, or hidden states, in every time step are accessible to the
+    user. This is especially useful in attention models. If you do not need
+    to access the internal states of the lstm, but merely use its outputs,
+    it is recommended to use lstmemory, which is relatively faster than
+    lstmemory_group.
+
+    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
+    multiplications:
+    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
+    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    speed up the calculations. Consequently, an additional mixed_layer with
+    full_matrix_projection must be included before lstmemory_unit is called.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = lstmemory_group(input=[layer1],
+                                    size=256,
+                                    act=TanhActivation(),
+                                    gate_act=SigmoidActivation(),
+                                    state_act=TanhActivation())
 
     :param input: input layer name.
     :type input: LayerOutput
@@ -591,13 +646,13 @@ def lstmemory_group(input, size=None, name=None,
     :type reverse: bool
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activate type
+    :param act: lstm final activation type
     :type act: BaseActivation
-    :param gate_act: lstm gate activate type
+    :param gate_act: lstm gate activation type
     :type gate_act: BaseActivation
-    :param state_act: lstm state activate type.
+    :param state_act: lstm state activation type.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
                             False means no bias, None means default bias.
     :type mixed_bias_attr: ParameterAttribute|False
     :param lstm_bias_attr: bias parameter attribute of lstm layer.
@@ -609,7 +664,7 @@ def lstmemory_group(input, size=None, name=None,
     :type lstm_layer_attr: ExtraLayerAttribute
     :param get_output_layer_attr: get output layer's extra attribute.
     :type get_output_layer_attr: ExtraLayerAttribute
-    :return: lstmemory group name.
+    :return: the lstmemory group.
     :rtype: LayerOutput
     """
 
@@ -639,16 +694,28 @@ def gru_unit(input,
              gate_act=None,
              gru_layer_attr=None):
     """
+    Define the calculations that a gated recurrent unit performs in a single
+    time step. This function itself is not a recurrent layer, so it cannot be
+    directly applied to sequence input. This function is almost always used in
+    the recurrent_group (see layers.py for more details) to implement an
+    attention mechanism.
 
-    :param input:
+    Please see grumemory in layers.py for the details about the maths.
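+
+    A sketch of how gru_unit is typically wrapped by a recurrent_group is
+    given below (all names and sizes are illustrative, and the input
+    projection follows the same pattern as simple_gru):
+
+    .. code-block:: python
+
+        # gru_unit expects its input to already be of size 3 * hidden_dim
+        with mixed_layer(size=hidden_dim * 3) as gru_input:
+            gru_input += full_matrix_projection(input=embedding)
+
+        def step(y):
+            return gru_unit(input=y, size=hidden_dim)
+
+        rnn = recurrent_group(step=step, input=gru_input)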
+
+    :param input: input layer name.
     :type input: LayerOutput
-    :param name:
-    :param size:
-    :param gru_bias_attr:
-    :param act:
-    :param gate_act:
-    :param gru_layer_attr:
-    :return:
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru output layer.
+    :rtype: LayerOutput
     """
 
     assert input.size % 3 == 0
@@ -678,6 +745,46 @@ def gru_group(input,
               gru_bias_attr=None,
               act=None, gate_act=None,
               gru_layer_attr=None):
+
+    """
+    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    does exactly the same calculation as the grumemory layer does. A promising
+    benefit is that the gru hidden states are accessible to the user. This is
+    especially useful in attention models. If you do not need to access
+    any internal state, but merely use the outputs of a GRU, it is
+    recommended to use the grumemory, which is relatively faster.
+
+    Please see grumemory in layers.py for more details about the maths.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gru = gru_group(input=[layer1],
+                        size=256,
+                        act=TanhActivation(),
+                        gate_act=SigmoidActivation())
+
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param reverse: whether to process the input data in a reverse order
+    :type reverse: bool
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_bias_attr: bias. False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru group.
+    :rtype: LayerOutput
+    """
+
     def __gru_step__(ipt):
         return gru_unit(
             input=ipt,
@@ -708,6 +815,43 @@ def simple_gru(input,
                gate_act=None,
                gru_layer_attr=None
                ):
+    """
+    simple_gru is also a recurrent layer group version of Gated Recurrent
+    Unit, just like gru_group. The difference only lies in implementation
+    details. In terms of computational speed, grumemory is relatively faster
+    than gru_group, and gru_group is relatively faster than simple_gru.
+
+    simple_gru does exactly the same calculation as the grumemory layer does.
+    Please see grumemory in layers.py for more details about the maths.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gru = simple_gru(input=[layer1],
+                         size=256,
+                         act=TanhActivation(),
+                         gate_act=SigmoidActivation())
+
+    :param input: input layer name.
+    :type input: LayerOutput
+    :param name: name of the gru group.
+    :type name: basestring
+    :param size: hidden size of the gru.
+    :type size: int
+    :param reverse: whether to process the input data in a reverse order
+    :type reverse: bool
+    :param act: type of the activation
+    :type act: BaseActivation
+    :param gate_act: type of the gate activation
+    :type gate_act: BaseActivation
+    :param gru_bias_attr: bias. False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False
+    :param gru_layer_attr: Extra parameter attribute of the gru layer.
+    :type gru_layer_attr: ParameterAttribute|False
+    :return: the gru group.
+    :rtype: LayerOutput
+    """
     with mixed_layer(name='%s_transform' % name,
                      size=size * 3,
                      bias_attr=mixed_bias_param_attr,
@@ -739,7 +883,22 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
                        last_seq_attr=None, first_seq_attr=None,
                        concat_attr=None, concat_act=None):
     """
-    TODO(yuyang18): Complete docs
+    A bidirectional_lstm is a recurrent unit that iterates over the input
+    sequence both in forward and backward orders, and then concatenates the
+    two outputs to form a final output. However, concatenation of the two
+    outputs is not the only way to form the final output; you can also, for
+    example, just add them together.
+
+    Please refer to **Neural Machine Translation by Jointly Learning to Align
+    and Translate** for more details about the bidirectional lstm.
+    The link is as follows:
+    .. _Link: https://arxiv.org/pdf/1409.0473v3.pdf
+
+    The example usage is:
+
+    .. code-block:: python
+
+        lstm_step = bidirectional_lstm(input=[input1], size=512)
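+
+    If the two directional outputs should be combined by addition rather than
+    concatenation, a configuration along the following lines could be written
+    by hand (a sketch only, not what bidirectional_lstm itself returns; the
+    names are illustrative):
+
+    .. code-block:: python
+
+        fwd = simple_lstm(input=input1, size=512, reverse=False)
+        bwd = simple_lstm(input=input1, size=512, reverse=True)
+        # element-wise sum of the forward and backward sequences
+        merged = addto_layer(input=[fwd, bwd])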
 
     :param name: bidirectional lstm layer name.
     :type name: basestring
@@ -747,8 +906,11 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param return_seq: If False, concat word in last time step and return.
-                       If True, concat sequnce in all time step and return.
+    :param return_seq: If set to False, the outputs of the last time step
+                       are concatenated and returned.
+                       If set to True, the entire output sequences that are
+                       processed in forward and backward directions are
+                       concatenated and returned.
     :type return_seq: bool
     :return: lstm layer name.
     :rtype: LayerOutput
-- 
GitLab