From 9c0895ee455368f37f27e48f67fe7a35ee9b20c3 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Tue, 30 Aug 2016 07:43:53 +0000
Subject: [PATCH] Add comments for recurrent layer and lstm layer.

ISSUE=4539687

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1422 1ad973e4-5ce8-4261-8a94-b56d1f490c56
---
 doc/source/gserver/layers/layer.rst         |   1 -
 paddle/gserver/layers/BlockExpandLayer.h    |  12 +-
 paddle/gserver/layers/LstmLayer.h           | 160 ++++++++++++++----
 paddle/gserver/layers/RecurrentLayer.cpp    |  78 +++++++--
 .../paddle/trainer_config_helpers/layers.py |  12 +-
 5 files changed, 215 insertions(+), 48 deletions(-)

diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers/layer.rst
index a864e18b9f..0406c75e4e 100644
--- a/doc/source/gserver/layers/layer.rst
+++ b/doc/source/gserver/layers/layer.rst
@@ -195,7 +195,6 @@ GruCompute
 ``````````
 .. doxygenclass:: paddle::GruCompute
    :members:
-
 Recurrent Layer Group
 =====================

diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 3b04c713e3..f8f8172127 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -24,13 +24,15 @@ namespace paddle {
 * @brief Expand feature map to minibatch matrix.
 * - matrix width is: blockH_ * blockW_ * channels_
 * - matirx height is: outputH_ * outputW_
+ *
 * \f[
- * outputH_ = 1 + (2 * paddingH_ + imgSizeH_ - blockH_ + strideH_ - 1) /
- * strideH_;
- * outputW_ = 1 + (2 * paddingW_ + imgSizeW_ - blockW_ + strideW_ - 1) /
- * strideW_;
+ * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
+ * strideH\_ \\
+ * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
+ * strideW\_
 * \f]
- * The expand method is same with ExpandConvLayer, but saved the transposed
+ *
+ * The expand method is the same as ExpandConvLayer's, but it saves the transposed
 * value. After expanding, output_.sequenceStartPositions will store timeline.
 * The number of time steps are outputH_ * outputW_ and the dimension of each
 * time step is blockH_ * blockW_ * channels_. This layer can be used after
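For concreteness, here is a minimal standalone sketch (not part of the patch; all
sizes are made-up values) that evaluates the two output-size formulas above and
the resulting minibatch matrix shape:

.. code-block:: cpp

    #include <cstdio>

    int main() {
      // Hypothetical input feature map and block parameters.
      int imgSizeH = 16, imgSizeW = 16;  // input feature map size
      int blockH = 4, blockW = 4;        // block size
      int strideH = 2, strideW = 2;      // stride
      int paddingH = 1, paddingW = 1;    // zero padding
      int channels = 3;

      // The formulas documented in BlockExpandLayer.h.
      int outputH = 1 + (2 * paddingH + imgSizeH - blockH + strideH - 1) / strideH;
      int outputW = 1 + (2 * paddingW + imgSizeW - blockW + strideW - 1) / strideW;

      // Matrix height = number of time steps; width = dimension per step.
      printf("time steps = %d, step dim = %d\n",
             outputH * outputW, blockH * blockW * channels);
      return 0;
    }

With these values the layer produces 8 * 8 = 64 time steps, each of dimension
4 * 4 * 3 = 48.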
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 75d73d365f..cb3c51c7bd 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -21,31 +21,54 @@ limitations under the License. */
 #include "LstmCompute.h"
 namespace paddle {
-/*
-LstmLayer takes 1 input layer with size * 4.
-Input layer is diveded into 4 equal parts:
- (input_s, input_ig, input_fg, input_og)
-
-For each sequence [start, end] it performs the following computation:
-
-out_i = actState(state_i) * actGate(outputGate_i)
-state_i = actInput(input_s_i + bias_s + output_{i-1} * recurrIW)
- * actGate(inputGate_i) + actGate(forgetGate_i) * state_{i-1}
-inputGate = input_ig_i + bias_ig + output_{i-1} * recurrIGW
- + state_{i-1} * inputCheck
-ouputGate = input_og_i + bias_og + output_{i-1} * recurrOGW
- + state_{i} * outputCheck
-forgetGate = input_fg_i + bias_fg + output_{i-1} * recurrFGW
- + state_{i-1} * forgetCheck
-
-parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
-baisParameter consists of
- (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
-
-actInput is defined by config active_type
-actState is defined by config active_state_type
-actGate is defined by config actvie_gate_type
-*/
+/**
+ * @brief LstmLayer takes 1 input layer with size * 4.
+ * Input layer is divided into 4 equal parts:
+ * (input_s, input_ig, input_fg, input_og)
+ *
+ * For each sequence [start, end] it performs the following computation:
+ * @code
+ * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
+ * state_{i} = actInput(input_s_{i} + bias_s +
+ *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
+ *             actGate(forgetGate_{i}) * state_{i-1}
+ * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
+ *             state_{i-1} * inputCheck
+ * outputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
+ *             state_{i} * outputCheck
+ * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
+ *             state_{i-1} * forgetCheck
+ * @endcode
+ *
+ * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
+ * - biasParameter consists of
+ *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
+ *
+ * - actInput is defined by config active_type.
+ * - actState is defined by config active_state_type.
+ * - actGate is defined by config active_gate_type.
+ *
+ * There are two ways to compute, namely one sequence by one sequence or
+ * one batch by one batch. By default, when pre_batch_state is not set to
+ * true, it computes batch by batch.
+ *
+ * The formula in the paper is as follows:
+ * \f[
+ * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
+ * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
+ * \tilde{c_t} = \tanh(W_{xc}x_{t} + W_{hc}h_{t-1} + b_c) \\
+ * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
+ * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
+ * h_t = o_t \tanh(c_t)
+ * \f]
+ *
+ * @note The \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+ * operations on the input sequence are NOT included in LstmLayer, so
+ * users should use fc_layer or mixed_layer before lstm_layer.
+ *
+ * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+ * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+ */
 class LstmLayer : public Layer, public LstmCompute {
 public:
@@ -64,43 +87,120 @@ public:
 LayerStatePtr getState();
 protected:
+ /**
+ * @brief Compute lstm forward one sequence by one sequence.
+ * @param batchSize The total number of words of all samples in this
+ * forward batch; it is not equal to the batch_size in the config file.
+ * @param numSequences The sample number. It is equal to the batch_size
+ * in the config file.
+ * @param starts The start position of each sample.
+ * @param inputValue The input values.
+ */
 void forwardSequence(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputValue);
+ /**
+ * Compute lstm backward one sequence by one sequence.
+ */
 void backwardSequence(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputGrad);
+ /**
+ * Compute lstm forward one batch by one batch. The batch value is
+ * reorganized by the SequenceToBatch class. The batch output value will
+ * be converted into sequence values after the forward pass finishes.
+ * Here, one batch contains one word of each sample. If the samples have
+ * different lengths, the batch is not zero-padded and simply contains
+ * fewer words. The total number of batches equals the max sequence
+ * length. See the SequenceToBatch class for details. In GPU mode, it
+ * launches a GPU kernel for each iteration of the loop.
+ *
+ * @code
+ * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
+ *   // compute one batch
+ * }
+ * @endcode
+ */
 void forwardBatch(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputValue);
+ /**
+ * Compute lstm backward one batch by one batch.
+ */
 void backwardBatch(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputGrad);
+ /**
+ * This function only supports GPU. It does not need to reorganize the
+ * input into batch values. It launches one kernel to compute forward
+ * propagation in parallel at the sequence level.
+ */
 void forwardSeqParallel(int batchSize, size_t numSequences,
 const int *starts, MatrixPtr inputValue);
+ /**
+ * Backward propagation corresponding to forwardSeqParallel.
+ */
 void backwardSeqParallel(int batchSize, size_t numSequences,
 const int *starts, MatrixPtr inputGrad);
+ /**
+ * This function is used for sequence generation and gets the output
+ * after forwardBatch.
+ */
 void getPrevBatchOutput(size_t numSequences);
+ /**
+ * This function is used for sequence generation and gets the state
+ * after forwardBatch.
+ */
 void getPrevBatchState(size_t numSequences);
 protected:
+ /// Learned parameters, shape: (size, 4*size).
+ /// The weight contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
 std::unique_ptr<Weight> weight_;
+ /// Learned bias parameter, shape: (1, 7 * size).
+ /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
 std::unique_ptr<Weight> bias_;
- /* real bias and peephole for different gates */
- MatrixPtr localBias_, checkIg_, checkFg_, checkOg_;
- /* the gradient of, real bias and peephole for different gates */
- MatrixPtr localBiasGrad_, checkIgGrad_, checkFgGrad_, checkOgGrad_;
-
+ /// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
+ MatrixPtr localBias_;
+ /// The peephole connection for the input gate.
+ MatrixPtr checkIg_;
+ /// The peephole connection for the forget gate.
+ MatrixPtr checkFg_;
+ /// The peephole connection for the output gate.
+ MatrixPtr checkOg_;
+ /// The gradient of the real bias.
+ MatrixPtr localBiasGrad_;
+ /// The gradient of the peephole connection for the input gate.
+ MatrixPtr checkIgGrad_;
+ /// The gradient of the peephole connection for the forget gate.
+ MatrixPtr checkFgGrad_;
+ /// The gradient of the peephole connection for the output gate.
+ MatrixPtr checkOgGrad_;
+
+ /// Stores the cell state of the previous time step, namely \f$c_{t-1}\f$.
 Argument state_;
+ /// Stores the hidden output of the previous time step, namely \f$h_{t-1}\f$.
 Argument preOutput_;
+ /// Stores the value and gradient of the four gates, namely
+ /// \f$i_t, f_t, o_t, c_t\f$.
 Argument gate_;
+ /// Whether it is a reversed lstm.
 bool reversed_;
+ /// Whether to use the batch method to compute.
 bool useBatch_;
+ /// Whether to use the sequence-parallel method to compute.
 bool useSeqParallel_;
+ /// batchValue_ is used in the batch calculation method. It stores the
+ /// batch value of the reorganized input.
 std::unique_ptr<SequenceToBatch> batchValue_;
+ /// The gradient of batchValue_.
 std::unique_ptr<SequenceToBatch> batchGrad_;
+ /// Used in generation; stores the state of the previous time step.
 MatrixPtr prevState_;
+ /// Used in generation; stores the output of the previous time step.
 MatrixPtr prevOutput_;
 MatrixPtr prevBatchOutput2_;
+ /// The total state.
 MatrixPtr totalState_;
 };
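To make the gate arithmetic concrete, here is a minimal scalar sketch of one
LSTM time step (not part of the patch; all values are hypothetical, and the
diagonal peephole connections W_ci, W_cf, W_co therefore appear as scalars):

.. code-block:: cpp

    #include <cmath>
    #include <cstdio>

    double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

    int main() {
      // Projected inputs (input_s, input_ig, input_fg, input_og); in the
      // real layer these come from an fc_layer/mixed_layer and already
      // include W_x*x_t, plus the recurrent terms W_h*h_{t-1}, which are
      // folded in here for brevity.
      double in_s = 0.3, in_ig = 0.1, in_fg = 0.2, in_og = -0.1;
      // Hypothetical peephole weights and biases.
      double w_ci = 0.5, w_cf = 0.4, w_co = 0.3;
      double b_i = 0.0, b_f = 1.0, b_c = 0.0, b_o = 0.0;
      double c_prev = 0.2;  // c_{t-1}

      double i_t = sigmoid(in_ig + w_ci * c_prev + b_i);  // input gate
      double f_t = sigmoid(in_fg + w_cf * c_prev + b_f);  // forget gate
      double c_cand = std::tanh(in_s + b_c);              // candidate c~_t
      double c_t = f_t * c_prev + i_t * c_cand;           // new cell state
      double o_t = sigmoid(in_og + w_co * c_t + b_o);     // output gate
      double h_t = o_t * std::tanh(c_t);                  // hidden output

      printf("c_t = %.4f, h_t = %.4f\n", c_t, h_t);
      return 0;
    }

Note how the output gate peeks at the new cell state c_t while the input and
forget gates peek at c_{t-1}, matching the formulas above.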
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index a7c7b89328..30ef679f92 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -22,16 +22,26 @@ P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 namespace paddle {
-/*
-RecurrentLayer takes 1 input layer with the same size.
-For each sequence [start, end] it performs the following computation:
-out_i = act(in_i) for i = start
-out_i = act(in_i + out_{i-1} * W) for start < i <= end
-
-If reversed is true, the order is reversed:
-out_i = act(in_i) for i = end
-out_i = act(in_i + out_{i+1} * W) for start <= i < end
-*/
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\
+ * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i \leq end
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\
+ * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start \leq i < end
+ * \f]
+ * There are two methods to calculate the rnn. One way is to compute it one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute it one batch by one batch. Users can select
+ * between them with the rnn_use_batch flag.
+ */
 class RecurrentLayer : public Layer {
 public:
 explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
@@ -49,23 +59,69 @@ public:
 LayerStatePtr getState();
 protected:
+ /**
+ * @brief If the user does not set --rnn_use_batch=true, it computes the
+ * rnn forward one sequence by one sequence by default.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+ /**
+ * @brief Compute rnn forward for one sequence.
+ * @param start The start position of this sequence (or sample).
+ * @param length The length of this sequence (or sample), namely the number
+ * of words in this sequence.
+ */
 void forwardOneSequence(int start, int length);
+ /**
+ * @brief Compute rnn backward one sequence by one sequence.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+ /**
+ * @brief Compute rnn backward for one sequence.
+ * @param start The start position of this sequence (or sample).
+ * @param length The length of this sequence (or sample), namely the number
+ * of words in this sequence.
+ */
 void backwardOneSequence(int start, int length);
+ /**
+ * @brief Reorganize input into batches and compute rnn forward batch
+ * by batch. It converts the batch shape back to sequences after finishing
+ * the forward pass. See the SequenceToBatch class for the batch details.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void forwardBatch(int batchSize, size_t numSequences, const int* starts);

+ /**
+ * @brief Reorganize input into batches and compute rnn backward batch
+ * by batch, corresponding to forwardBatch.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void backwardBatch(int batchSize, size_t numSequences, const int* starts);
 protected:
 std::unique_ptr<Weight> weight_;
 std::unique_ptr<Weight> bias_;
- // frameOutput_[i] is used to hold the i-th sample of output_
+ /// frameOutput_[i] is used to hold the i-th sample of output_
 std::vector<Argument> frameOutput_;
 MatrixPtr prevOutput_;
+ /// Whether to compute the rnn in reverse.
 bool reversed_;
+ /// If computing batch by batch, batchValue_ will be used to save the
+ /// reorganized input value.
 std::unique_ptr<SequenceToBatch> batchValue_;
+ /// If computing batch by batch, batchGrad_ will be used to save the
+ /// gradient with respect to the reorganized input value.
 std::unique_ptr<SequenceToBatch> batchGrad_;
 };
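A minimal scalar sketch of the sequence-by-sequence recurrence above (not part
of the patch; the real layer applies the configured activation and a
[size, size] weight matrix rather than the scalar w used here):

.. code-block:: cpp

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // out[start] = act(in[start]);
    // out[i]     = act(in[i] + out[i-1] * w)  for start < i <= end.
    // 'act' is tanh here; 'reversed' runs the recurrence from the end.
    std::vector<double> forwardOneSequence(const std::vector<double>& in,
                                           double w, bool reversed) {
      std::vector<double> out(in.size());
      if (!reversed) {
        for (size_t i = 0; i < in.size(); ++i)
          out[i] = std::tanh(in[i] + (i == 0 ? 0.0 : out[i - 1] * w));
      } else {
        for (size_t j = in.size(); j-- > 0;)
          out[j] = std::tanh(in[j] +
                             (j + 1 == in.size() ? 0.0 : out[j + 1] * w));
      }
      return out;
    }

    int main() {
      std::vector<double> in = {0.1, 0.4, -0.2, 0.3};  // one sequence
      std::vector<double> out = forwardOneSequence(in, 0.5, false);
      for (double v : out) printf("%f ", v);
      printf("\n");
      return 0;
    }

The batch method computes the same recurrence, but groups the i-th word of
every active sequence into a single matrix operation, which is exactly the
reorganization that SequenceToBatch sets up.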
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 85625c2f6a..d8903ff818 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2617,7 +2617,6 @@ def tensor_layer(input, size, act=None, name=None,
 In this formular:
 - :math:`x_{1}`: the first input contains M elements.
 - :math:`x_{2}`: the second input contains N elements.
- - y[out]: contains K elements.
 - :math:`y_{i}`: the i-th element of y.
 - :math:`W_{i}`: the i-th learned weight, shape if [M, N]
 - :math:`{x_{2}}^\mathrm{T}`: the transpose of :math:`x_{2}`.
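As a quick check of the tensor_layer formula documented above, a standalone
sketch (not part of the patch; sizes and values are made up) that evaluates
y_i = x1 * W_i * x2^T for each of the K weight slices:

.. code-block:: cpp

    #include <cstdio>

    int main() {
      const int M = 2, N = 3, K = 2;  // hypothetical sizes
      double x1[M] = {1.0, 2.0};
      double x2[N] = {0.5, -1.0, 2.0};
      // K learned weight slices, each of shape [M, N].
      double W[K][M][N] = {
          {{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}},
          {{-0.1, 0.0, 0.1}, {0.2, -0.2, 0.3}}};

      for (int i = 0; i < K; ++i) {
        double y_i = 0.0;  // y_i = x1 * W_i * x2^T
        for (int m = 0; m < M; ++m)
          for (int n = 0; n < N; ++n)
            y_i += x1[m] * W[i][m][n] * x2[n];
        printf("y[%d] = %.4f\n", i, y_i);
      }
      return 0;
    }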
@@ -2909,6 +2908,17 @@ def block_expand_layer(input,
 time step is block_y * block_x * channel. This layer can be used after
 convolution neural network, and before recurrent neural network.

+    The simple usage is:
+
+    .. code-block:: python
+
+       block_expand = block_expand_layer(input,
+                                         channel=128,
+                                         stride_x=1,
+                                         stride_y=1,
+                                         block_x=1,
+                                         block_y=3)
+
 :param input: The input layer.
 :type input: LayerOutput
 :param channel: The channel number of input layer.
--
GitLab