From 9c0895ee455368f37f27e48f67fe7a35ee9b20c3 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Tue, 30 Aug 2016 07:43:53 +0000
Subject: [PATCH] Add comments for recurrent layer and lstm layer.

ISSUE=4539687

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1422 1ad973e4-5ce8-4261-8a94-b56d1f490c56
---
 doc/source/gserver/layers/layer.rst         |   1 -
 paddle/gserver/layers/BlockExpandLayer.h    |  12 +-
 paddle/gserver/layers/LstmLayer.h           | 160 ++++++++++++++----
 paddle/gserver/layers/RecurrentLayer.cpp    |  78 +++++++--
 .../paddle/trainer_config_helpers/layers.py |  12 +-
 5 files changed, 215 insertions(+), 48 deletions(-)

diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers/layer.rst
index a864e18b9f..0406c75e4e 100644
--- a/doc/source/gserver/layers/layer.rst
+++ b/doc/source/gserver/layers/layer.rst
@@ -195,7 +195,6 @@ GruCompute
 ``````````
 .. doxygenclass:: paddle::GruCompute
    :members:
-
 Recurrent Layer Group
 =====================

diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 3b04c713e3..f8f8172127 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -24,13 +24,15 @@ namespace paddle {
 * @brief Expand feature map to minibatch matrix.
 * - matrix width is: blockH_ * blockW_ * channels_
 * - matirx height is: outputH_ * outputW_
+ *
 * \f[
- * outputH_ = 1 + (2 * paddingH_ + imgSizeH_ - blockH_ + strideH_ - 1) /
- * strideH_;
- * outputW_ = 1 + (2 * paddingW_ + imgSizeW_ - blockW_ + strideW_ - 1) /
- * strideW_;
+ * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
+ * strideH\_ \\
+ * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
+ * strideW\_
 * \f]
- * The expand method is same with ExpandConvLayer, but saved the transposed
+ *
+ * The expand method is the same as ExpandConvLayer's, but it saves the transposed
 * value. After expanding, output_.sequenceStartPositions will store timeline.
 * The number of time steps are outputH_ * outputW_ and the dimension of each
 * time step is blockH_ * blockW_ * channels_. This layer can be used after
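For concreteness, here is a minimal standalone sketch (not part of the patch; all
sizes are made-up values) that evaluates the two output-size formulas above and
the resulting minibatch matrix shape:

.. code-block:: cpp

    #include <cstdio>

    int main() {
      // Hypothetical input feature map and block parameters.
      int imgSizeH = 16, imgSizeW = 16;  // input feature map size
      int blockH = 4, blockW = 4;        // block size
      int strideH = 2, strideW = 2;      // stride
      int paddingH = 1, paddingW = 1;    // zero padding
      int channels = 3;

      // The formulas documented in BlockExpandLayer.h.
      int outputH = 1 + (2 * paddingH + imgSizeH - blockH + strideH - 1) / strideH;
      int outputW = 1 + (2 * paddingW + imgSizeW - blockW + strideW - 1) / strideW;

      // Matrix height = number of time steps; width = dimension per step.
      printf("time steps = %d, step dim = %d\n",
             outputH * outputW, blockH * blockW * channels);
      return 0;
    }

With these values the layer produces 8 * 8 = 64 time steps, each of dimension
4 * 4 * 3 = 48.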
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 75d73d365f..cb3c51c7bd 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -21,31 +21,54 @@ limitations under the License. */
 #include "LstmCompute.h"
 namespace paddle {
-/*
-LstmLayer takes 1 input layer with size * 4.
-Input layer is diveded into 4 equal parts:
- (input_s, input_ig, input_fg, input_og)
-
-For each sequence [start, end] it performs the following computation:
-
-out_i = actState(state_i) * actGate(outputGate_i)
-state_i = actInput(input_s_i + bias_s + output_{i-1} * recurrIW)
- * actGate(inputGate_i) + actGate(forgetGate_i) * state_{i-1}
-inputGate = input_ig_i + bias_ig + output_{i-1} * recurrIGW
- + state_{i-1} * inputCheck
-ouputGate = input_og_i + bias_og + output_{i-1} * recurrOGW
- + state_{i} * outputCheck
-forgetGate = input_fg_i + bias_fg + output_{i-1} * recurrFGW
- + state_{i-1} * forgetCheck
-
-parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
-baisParameter consists of
- (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
-
-actInput is defined by config active_type
-actState is defined by config active_state_type
-actGate is defined by config actvie_gate_type
-*/
+/**
+ * @brief LstmLayer takes 1 input layer with size * 4.
+ * Input layer is divided into 4 equal parts:
+ * (input_s, input_ig, input_fg, input_og)
+ *
+ * For each sequence [start, end] it performs the following computation:
+ * @code
+ * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
+ * state_{i} = actInput(input_s_{i} + bias_s +
+ *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
+ *             actGate(forgetGate_{i}) * state_{i-1}
+ * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
+ *             state_{i-1} * inputCheck
+ * outputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
+ *             state_{i} * outputCheck
+ * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
+ *             state_{i-1} * forgetCheck
+ * @endcode
+ *
+ * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
+ * - biasParameter consists of
+ *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
+ *
+ * - actInput is defined by config active_type.
+ * - actState is defined by config active_state_type.
+ * - actGate is defined by config active_gate_type.
+ *
+ * There are two ways to compute, namely one sequence by one sequence or
+ * one batch by one batch. By default, when pre_batch_state is not set to
+ * true, it computes batch by batch.
+ *
+ * The formula in the paper is as follows:
+ * \f[
+ * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
+ * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
+ * \tilde{c_t} = \tanh(W_{xc}x_{t} + W_{hc}h_{t-1} + b_c) \\
+ * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
+ * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
+ * h_t = o_t \tanh(c_t)
+ * \f]
+ *
+ * @note The \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+ * operations on the input sequence are NOT included in LstmLayer, so
+ * users should use fc_layer or mixed_layer before lstm_layer.
+ *
+ * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+ * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+ */
 class LstmLayer : public Layer, public LstmCompute {
 public:
@@ -64,43 +87,120 @@ public:
 LayerStatePtr getState();
 protected:
+ /**
+ * @brief Compute lstm forward one sequence by one sequence.
+ * @param batchSize The total number of words of all samples in this
+ * forward batch; it is not equal to the batch_size in the config file.
+ * @param numSequences The sample number. It is equal to the batch_size
+ * in the config file.
+ * @param starts The start position of each sample.
+ * @param inputValue The input values.
+ */
 void forwardSequence(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputValue);
+ /**
+ * Compute lstm backward one sequence by one sequence.
+ */
 void backwardSequence(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputGrad);
+ /**
+ * Compute lstm forward one batch by one batch. The batch value is
+ * reorganized by the SequenceToBatch class. The batch output value will
+ * be converted into sequence values after the forward pass finishes.
+ * Here, one batch contains one word of each sample. If the samples have
+ * different lengths, the batch is not zero-padded and simply contains
+ * fewer words. The total number of batches equals the max sequence
+ * length. See the SequenceToBatch class for details. In GPU mode, it
+ * launches a GPU kernel for each iteration of the loop.
+ *
+ * @code
+ * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
+ *   // compute one batch
+ * }
+ * @endcode
+ */
 void forwardBatch(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputValue);
+ /**
+ * Compute lstm backward one batch by one batch.
+ */
 void backwardBatch(int batchSize, size_t numSequences, const int *starts,
 MatrixPtr inputGrad);
+ /**
+ * This function only supports GPU. It does not need to reorganize the
+ * input into batch values. It launches one kernel to compute forward
+ * propagation in parallel at the sequence level.
+ */
 void forwardSeqParallel(int batchSize, size_t numSequences,
 const int *starts, MatrixPtr inputValue);
+ /**
+ * Backward propagation corresponding to forwardSeqParallel.
+ */
 void backwardSeqParallel(int batchSize, size_t numSequences,
 const int *starts, MatrixPtr inputGrad);
+ /**
+ * This function is used for sequence generation and gets the output
+ * after forwardBatch.
+ */
 void getPrevBatchOutput(size_t numSequences);
+ /**
+ * This function is used for sequence generation and gets the state
+ * after forwardBatch.
+ */
 void getPrevBatchState(size_t numSequences);
 protected:
+ /// Learned parameters, shape: (size, 4*size).
+ /// The weight contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
 std::unique_ptr<Weight> weight_;
+ /// Learned bias parameter, shape: (1, 7 * size).
+ /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
 std::unique_ptr<Weight> bias_;
- /* real bias and peephole for different gates */
- MatrixPtr localBias_, checkIg_, checkFg_, checkOg_;
- /* the gradient of, real bias and peephole for different gates */
- MatrixPtr localBiasGrad_, checkIgGrad_, checkFgGrad_, checkOgGrad_;
-
+ /// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
+ MatrixPtr localBias_;
+ /// The peephole connection for the input gate.
+ MatrixPtr checkIg_;
+ /// The peephole connection for the forget gate.
+ MatrixPtr checkFg_;
+ /// The peephole connection for the output gate.
+ MatrixPtr checkOg_;
+ /// The gradient of the real bias.
+ MatrixPtr localBiasGrad_;
+ /// The gradient of the peephole connection for the input gate.
+ MatrixPtr checkIgGrad_;
+ /// The gradient of the peephole connection for the forget gate.
+ MatrixPtr checkFgGrad_;
+ /// The gradient of the peephole connection for the output gate.
+ MatrixPtr checkOgGrad_;
+
+ /// Stores the cell state of the previous time step, namely \f$c_{t-1}\f$.
 Argument state_;
+ /// Stores the hidden output of the previous time step, namely \f$h_{t-1}\f$.
 Argument preOutput_;
+ /// Stores the value and gradient of the four gates, namely
+ /// \f$i_t, f_t, o_t, c_t\f$.
 Argument gate_;
+ /// Whether it is a reversed lstm.
 bool reversed_;
+ /// Whether to use the batch method to compute.
 bool useBatch_;
+ /// Whether to use the sequence-parallel method to compute.
 bool useSeqParallel_;
+ /// batchValue_ is used in the batch calculation method. It stores the
+ /// batch value of the reorganized input.
 std::unique_ptr<SequenceToBatch> batchValue_;
+ /// The gradient of batchValue_.
 std::unique_ptr<SequenceToBatch> batchGrad_;
+ /// Used in generation; stores the state of the previous time step.
 MatrixPtr prevState_;
+ /// Used in generation; stores the output of the previous time step.
 MatrixPtr prevOutput_;
 MatrixPtr prevBatchOutput2_;
+ /// The total state.
 MatrixPtr totalState_;
 };
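To make the gate arithmetic concrete, here is a minimal scalar sketch of one
LSTM time step (not part of the patch; all values are hypothetical, and the
diagonal peephole connections W_ci, W_cf, W_co therefore appear as scalars):

.. code-block:: cpp

    #include <cmath>
    #include <cstdio>

    double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

    int main() {
      // Projected inputs (input_s, input_ig, input_fg, input_og); in the
      // real layer these come from an fc_layer/mixed_layer and already
      // include W_x*x_t, plus the recurrent terms W_h*h_{t-1}, which are
      // folded in here for brevity.
      double in_s = 0.3, in_ig = 0.1, in_fg = 0.2, in_og = -0.1;
      // Hypothetical peephole weights and biases.
      double w_ci = 0.5, w_cf = 0.4, w_co = 0.3;
      double b_i = 0.0, b_f = 1.0, b_c = 0.0, b_o = 0.0;
      double c_prev = 0.2;  // c_{t-1}

      double i_t = sigmoid(in_ig + w_ci * c_prev + b_i);  // input gate
      double f_t = sigmoid(in_fg + w_cf * c_prev + b_f);  // forget gate
      double c_cand = std::tanh(in_s + b_c);              // candidate c~_t
      double c_t = f_t * c_prev + i_t * c_cand;           // new cell state
      double o_t = sigmoid(in_og + w_co * c_t + b_o);     // output gate
      double h_t = o_t * std::tanh(c_t);                  // hidden output

      printf("c_t = %.4f, h_t = %.4f\n", c_t, h_t);
      return 0;
    }

Note how the output gate peeks at the new cell state c_t while the input and
forget gates peek at c_{t-1}, matching the formulas above.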
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index a7c7b89328..30ef679f92 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -22,16 +22,26 @@ P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 namespace paddle {
-/*
-RecurrentLayer takes 1 input layer with the same size.
-For each sequence [start, end] it performs the following computation:
-out_i = act(in_i) for i = start
-out_i = act(in_i + out_{i-1} * W) for start < i <= end
-
-If reversed is true, the order is reversed:
-out_i = act(in_i) for i = end
-out_i = act(in_i + out_{i+1} * W) for start <= i < end
-*/
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\
+ * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i \leq end
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\
+ * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start \leq i < end
+ * \f]
+ * There are two methods to calculate the rnn. One way is to compute it one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute it one batch by one batch. Users can select
+ * between them with the rnn_use_batch flag.
+ */
 class RecurrentLayer : public Layer {
 public:
 explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
@@ -49,23 +59,69 @@ public:
 LayerStatePtr getState();
 protected:
+ /**
+ * @brief If the user does not set --rnn_use_batch=true, it computes the
+ * rnn forward one sequence by one sequence by default.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+ /**
+ * @brief Compute rnn forward for one sequence.
+ * @param start The start position of this sequence (or sample).
+ * @param length The length of this sequence (or sample), namely the number
+ * of words in this sequence.
+ */
 void forwardOneSequence(int start, int length);
+ /**
+ * @brief Compute rnn backward one sequence by one sequence.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+ /**
+ * @brief Compute rnn backward for one sequence.
+ * @param start The start position of this sequence (or sample).
+ * @param length The length of this sequence (or sample), namely the number
+ * of words in this sequence.
+ */
 void backwardOneSequence(int start, int length);
+ /**
+ * @brief Reorganize input into batches and compute rnn forward batch
+ * by batch. It converts the batch shape back to sequences after finishing
+ * the forward pass. See the SequenceToBatch class for the batch details.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void forwardBatch(int batchSize, size_t numSequences, const int* starts);

+ /**
+ * @brief Reorganize input into batches and compute rnn backward batch
+ * by batch, corresponding to forwardBatch.
+ * @param batchSize Total number of words of all samples in this batch.
+ * @param numSequences The sample number.
+ * @param starts The start position of each sample.
+ */
 void backwardBatch(int batchSize, size_t numSequences, const int* starts);
 protected:
 std::unique_ptr<Weight> weight_;
 std::unique_ptr<Weight> bias_;
- // frameOutput_[i] is used to hold the i-th sample of output_
+ /// frameOutput_[i] is used to hold the i-th sample of output_
 std::vector<Argument> frameOutput_;
 MatrixPtr prevOutput_;
+ /// Whether to compute the rnn in reverse.
 bool reversed_;
+ /// If computing batch by batch, batchValue_ will be used to save the
+ /// reorganized input value.
 std::unique_ptr<SequenceToBatch> batchValue_;
+ /// If computing batch by batch, batchGrad_ will be used to save the
+ /// gradient with respect to the reorganized input value.
 std::unique_ptr<SequenceToBatch> batchGrad_;
 };
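A minimal scalar sketch of the sequence-by-sequence recurrence above (not part
of the patch; the real layer applies the configured activation and a
[size, size] weight matrix rather than the scalar w used here):

.. code-block:: cpp

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // out[start] = act(in[start]);
    // out[i]     = act(in[i] + out[i-1] * w)  for start < i <= end.
    // 'act' is tanh here; 'reversed' runs the recurrence from the end.
    std::vector<double> forwardOneSequence(const std::vector<double>& in,
                                           double w, bool reversed) {
      std::vector<double> out(in.size());
      if (!reversed) {
        for (size_t i = 0; i < in.size(); ++i)
          out[i] = std::tanh(in[i] + (i == 0 ? 0.0 : out[i - 1] * w));
      } else {
        for (size_t j = in.size(); j-- > 0;)
          out[j] = std::tanh(in[j] +
                             (j + 1 == in.size() ? 0.0 : out[j + 1] * w));
      }
      return out;
    }

    int main() {
      std::vector<double> in = {0.1, 0.4, -0.2, 0.3};  // one sequence
      std::vector<double> out = forwardOneSequence(in, 0.5, false);
      for (double v : out) printf("%f ", v);
      printf("\n");
      return 0;
    }

The batch method computes the same recurrence, but groups the i-th word of
every active sequence into a single matrix operation, which is exactly the
reorganization that SequenceToBatch sets up.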
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 85625c2f6a..d8903ff818 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2617,7 +2617,6 @@ def tensor_layer(input, size, act=None, name=None,
 In this formular:
 - :math:`x_{1}`: the first input contains M elements.
 - :math:`x_{2}`: the second input contains N elements.
- - y[out]: contains K elements.
 - :math:`y_{i}`: the i-th element of y.
 - :math:`W_{i}`: the i-th learned weight, shape if [M, N]
 - :math:`{x_{2}}^\mathrm{T}`: the transpose of :math:`x_{2}`.
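As a quick check of the tensor_layer formula documented above, a standalone
sketch (not part of the patch; sizes and values are made up) that evaluates
y_i = x1 * W_i * x2^T for each of the K weight slices:

.. code-block:: cpp

    #include <cstdio>

    int main() {
      const int M = 2, N = 3, K = 2;  // hypothetical sizes
      double x1[M] = {1.0, 2.0};
      double x2[N] = {0.5, -1.0, 2.0};
      // K learned weight slices, each of shape [M, N].
      double W[K][M][N] = {
          {{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}},
          {{-0.1, 0.0, 0.1}, {0.2, -0.2, 0.3}}};

      for (int i = 0; i < K; ++i) {
        double y_i = 0.0;  // y_i = x1 * W_i * x2^T
        for (int m = 0; m < M; ++m)
          for (int n = 0; n < N; ++n)
            y_i += x1[m] * W[i][m][n] * x2[n];
        printf("y[%d] = %.4f\n", i, y_i);
      }
      return 0;
    }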
@@ -2909,6 +2908,17 @@ def block_expand_layer(input,
 time step is block_y * block_x * channel. This layer can be used after
 convolution neural network, and before recurrent neural network.

+    The simple usage is:
+
+    .. code-block:: python
+
+       block_expand = block_expand_layer(input,
+                                         channel=128,
+                                         stride_x=1,
+                                         stride_y=1,
+                                         block_x=1,
+                                         block_y=3)
+
 :param input: The input layer.
 :type input: LayerOutput
 :param channel: The channel number of input layer.
--
GitLab