Commit 9c0895ee authored by dangqingqing

Add comments for recurrent layer and lstm layer.

ISSUE=4539687

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1422 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent ee473459
@@ -196,7 +196,6 @@ GruCompute
.. doxygenclass:: paddle::GruCompute
:members:
Recurrent Layer Group
=====================
......
@@ -24,13 +24,15 @@ namespace paddle {
* @brief Expand feature map to minibatch matrix.
* - matrix width is: blockH_ * blockW_ * channels_
* - matrix height is: outputH_ * outputW_
*
* \f[
* outputH_ = 1 + (2 * paddingH_ + imgSizeH_ - blockH_ + strideH_ - 1) /
* strideH_;
* outputW_ = 1 + (2 * paddingW_ + imgSizeW_ - blockW_ + strideW_ - 1) /
* strideW_;
* outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
* strideH\_ \\
* outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
* strideW\_
* \f]
* The expand method is same with ExpandConvLayer, but saved the transposed
*
* The expand method is the same as in ExpandConvLayer, but it saves the transposed
* value. After expanding, output_.sequenceStartPositions will store the timeline.
* The number of time steps is outputH_ * outputW_ and the dimension of each
* time step is blockH_ * blockW_ * channels_. This layer can be used after
......
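As a quick numeric check of the output-size formula above, here is a minimal standalone sketch. The function and variable names are illustrative only, not the layer's actual members:

#include <cstdio>

// Output extent along one dimension, following the formula in the
// ImageExpandLayer comment:
//   output = 1 + (2 * padding + imgSize - block + stride - 1) / stride
int expandOutputSize(int imgSize, int block, int stride, int padding) {
  return 1 + (2 * padding + imgSize - block + stride - 1) / stride;
}

int main() {
  // A 10x10 feature map with 3x3 blocks, stride 1 and no padding yields
  // an 8x8 grid, i.e. 64 time steps of dimension 3 * 3 * channels.
  printf("%d\n", expandOutputSize(10, 3, 1, 0));  // prints 8
  return 0;
}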
@@ -21,31 +21,54 @@ limitations under the License. */
#include "LstmCompute.h"
namespace paddle {
/*
LstmLayer takes 1 input layer with size * 4.
Input layer is diveded into 4 equal parts:
(input_s, input_ig, input_fg, input_og)
For each sequence [start, end] it performs the following computation:
out_i = actState(state_i) * actGate(outputGate_i)
state_i = actInput(input_s_i + bias_s + output_{i-1} * recurrIW)
* actGate(inputGate_i) + actGate(forgetGate_i) * state_{i-1}
inputGate = input_ig_i + bias_ig + output_{i-1} * recurrIGW
+ state_{i-1} * inputCheck
ouputGate = input_og_i + bias_og + output_{i-1} * recurrOGW
+ state_{i} * outputCheck
forgetGate = input_fg_i + bias_fg + output_{i-1} * recurrFGW
+ state_{i-1} * forgetCheck
parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
baisParameter consists of
(bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
actInput is defined by config active_type
actState is defined by config active_state_type
actGate is defined by config actvie_gate_type
*/
/**
* @brief LstmLayer takes 1 input layer with size * 4.
* The input layer is divided into 4 equal parts:
* (input_s, input_ig, input_fg, input_og)
*
* For each sequence [start, end] it performs the following computation:
* @code
* output_{i} = actState(state_{i}) * actGate(outputGate_{i})
* state_{i} = actInput(input_s_{i} + bias_s +
* output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
* actGate(forgetGate_{i}) * state_{i-1}
* inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
* state_{i-1} * inputCheck
* outputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
* state_{i} * outputCheck
* forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
* state_{i-1} * forgetCheck
* @endcode
*
* - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
* - biasParameter consists of
* (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
*
* - actInput is defined by config active_type.
* - actState is defined by config active_state_type.
* - actGate is defined by config active_gate_type.
*
* There are two ways to compute: one sequence by one sequence, or one
* batch by one batch. By default, when pre_batch_state is not set to true,
* it computes batch by batch.
*
* The formula in the paper is as follows:
* \f[
* i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
* f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
* \tilde{c_t} = \tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c) \\
* o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
* c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
* h_t = o_t * \tanh(c_t)
* \f]
*
* @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
* operations on the input sequence are NOT included in LstmLayer. So
* users should use fc_layer or mixed_layer before lstm_layer.
*
* The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
* The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
*/
class LstmLayer : public Layer, public LstmCompute {
public:
@@ -64,43 +87,120 @@ public:
LayerStatePtr getState();
protected:
/**
* @brief Compute lstm forward one sequence by one sequence.
* @param batchSize Note that batchSize is not equal to the batch_size in
* the config file. It is the total number of words of all samples
* in this forward batch.
* @param numSequences The number of samples. It is equal to the batch_size
* in the config file.
* @param starts The start position of each sample.
* @param inputValue The input values.
*/
void forwardSequence(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
/**
* Compute lstm backward one sequence by one sequence.
*/
void backwardSequence(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputGrad);
/**
* Compute lstm forward one batch by one batch. The batch value is
* reorganized by the SequenceToBatch class. The batch output values are
* converted back into sequence values after the forward pass finishes. Here,
* one batch contains one word of each sample. If the sample lengths are not
* equal, the batch does not pad zeros and simply contains fewer words.
* The total number of batches is the max length of the sequences. The details
* can be found in the SequenceToBatch class. In GPU mode, it launches a GPU
* kernel for each iteration of the loop:
*
* @code
* for (int i = 0; i < numBatch(max_sequence_length); ++i) {
* compute one batch.
* }
* @endcode
*/
void forwardBatch(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
/**
* Compute lstm backward one batch by one batch.
*/
void backwardBatch(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputGrad);
/**
* This function only supports GPU. It does not need to reorganize the input
* into batch values. It launches one kernel to compute the forward
* propagation of all sequences in parallel.
*/
void forwardSeqParallel(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputValue);
/**
* Backward propagation corresponding to forwardSeqParallel.
*/
void backwardSeqParallel(int batchSize, size_t numSequences,
const int *starts, MatrixPtr inputGrad);
/**
* This function is used for sequence generation and gets the output after
* forwardBatch.
*/
void getPrevBatchOutput(size_t numSequences);
/**
* This function is used for sequence generation and gets the state after
* forwardBatch.
*/
void getPrevBatchState(size_t numSequences);
protected:
/// Learned parameters, shape: (size, 4*size).
/// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
std::unique_ptr<Weight> weight_;
/// Learned bias parameter, shape: (1, 7 * size).
/// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
std::unique_ptr<Weight> bias_;
/* real bias and peephole for different gates */
MatrixPtr localBias_, checkIg_, checkFg_, checkOg_;
/* the gradient of, real bias and peephole for different gates */
MatrixPtr localBiasGrad_, checkIgGrad_, checkFgGrad_, checkOgGrad_;
/// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
MatrixPtr localBias_;
/// The peephole connection for the input gate.
MatrixPtr checkIg_;
/// The peephole connection for the forget gate.
MatrixPtr checkFg_;
/// The peephole connection for the output gate.
MatrixPtr checkOg_;
/// The gradient of the real bias.
MatrixPtr localBiasGrad_;
/// The gradient of the peephole connection for the input gate.
MatrixPtr checkIgGrad_;
/// The gradient of the peephole connection for the forget gate.
MatrixPtr checkFgGrad_;
/// The gradient of the peephole connection for the output gate.
MatrixPtr checkOgGrad_;
/// Stores the cell state of the previous time step, namely \f$c_{t-1}\f$.
Argument state_;
/// Stores the hidden state of the previous time step, namely \f$h_{t-1}\f$.
Argument preOutput_;
/// Stores the value and gradient of four gates, namely
/// \f$i_t, f_t, o_t, c_t\f$.
Argument gate_;
/// Whether it is a reversed lstm.
bool reversed_;
/// Whether to use the batch method to compute.
bool useBatch_;
/// Whether to use the sequence-parallel method to compute.
bool useSeqParallel_;
/// batchValue_ is used in the batch calculation method. It stores the
/// batch value of the reorganized input.
std::unique_ptr<SequenceToBatch> batchValue_;
/// The gradient of batchValue_.
std::unique_ptr<SequenceToBatch> batchGrad_;
/// Used in generation and stores the state of the previous time step.
MatrixPtr prevState_;
/// Used in generation and stores the output of the previous time step.
MatrixPtr prevOutput_;
MatrixPtr prevBatchOutput2_;
/// The total state.
MatrixPtr totalState_;
};
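As a sanity check on the gate equations documented above, here is a minimal scalar sketch of a single LSTM time step with peepholes. It assumes sigmoid gates and tanh input/state activations; all names are illustrative, and none of this is LstmLayer's actual API:

#include <cmath>

// One scalar LSTM step following the documented equations:
//   state_i = actInput(input_s_i + bias_s + output_{i-1} * recurrIW)
//             * actGate(inputGate_i) + actGate(forgetGate_i) * state_{i-1}
//   out_i   = actState(state_i) * actGate(outputGate_i)
struct LstmStepSketch {
  double recurrIW, recurrIGW, recurrFGW, recurrOGW;  // recurrent weights
  double biasS, biasIg, biasFg, biasOg;              // biases
  double inputCheck, forgetCheck, outputCheck;       // peephole weights

  static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

  // inS, inIg, inFg, inOg are the four slices of the (size * 4) input.
  void step(double inS, double inIg, double inFg, double inOg,
            double prevOut, double prevState,
            double &state, double &out) const {
    double ig = sigmoid(inIg + biasIg + prevOut * recurrIGW +
                        prevState * inputCheck);
    double fg = sigmoid(inFg + biasFg + prevOut * recurrFGW +
                        prevState * forgetCheck);
    state = std::tanh(inS + biasS + prevOut * recurrIW) * ig +
            fg * prevState;
    double og = sigmoid(inOg + biasOg + prevOut * recurrOGW +
                        state * outputCheck);  // uses the NEW state
    out = std::tanh(state) * og;
  }
};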
......
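LstmLayer::forwardBatch above and RecurrentLayer::forwardBatch below rely on the same reorganization idea. A tiny sketch of that idea follows; SequenceToBatch itself is not shown, and the starts array mirrors the parameter of the same name (the real class also orders sequences by length, which the example data already satisfies):

#include <algorithm>
#include <cstdio>

int main() {
  // Three sequences of lengths 4, 3 and 2 laid out flat as 9 words;
  // starts[s] is the offset of sequence s, as in forwardBatch.
  const int starts[] = {0, 4, 7, 9};
  const int numSequences = 3;

  int maxLen = 0;
  for (int s = 0; s < numSequences; ++s) {
    maxLen = std::max(maxLen, starts[s + 1] - starts[s]);
  }

  // Batch t holds word t of every sequence that still has one, so later
  // batches simply shrink instead of being zero-padded.
  for (int t = 0; t < maxLen; ++t) {
    printf("batch %d:", t);
    for (int s = 0; s < numSequences; ++s) {
      if (t < starts[s + 1] - starts[s]) printf(" w%d", starts[s] + t);
    }
    printf("\n");
  }
  return 0;
}
// Output:
//   batch 0: w0 w4 w7
//   batch 1: w1 w5 w8
//   batch 2: w2 w6
//   batch 3: w3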
@@ -22,16 +22,26 @@ P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
namespace paddle {
/*
RecurrentLayer takes 1 input layer with the same size.
For each sequence [start, end] it performs the following computation:
out_i = act(in_i) for i = start
out_i = act(in_i + out_{i-1} * W) for start < i <= end
If reversed is true, the order is reversed:
out_i = act(in_i) for i = end
out_i = act(in_i + out_{i+1} * W) for start <= i < end
*/
/**
* @brief RecurrentLayer takes 1 input layer. The output size is the same as
* that of the input layer.
* For each sequence [start, end] it performs the following computation:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\
* out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i \leq end
* \f]
* If reversed is true, the order is reversed:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\
* out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start \leq i < end
* \f]
* There are two methods to calculate the rnn. One is to compute it one
* sequence after another; the other is to reorganize the input into
* batches and then compute it one batch after another. Users can select
* between them with the rnn_use_batch flag.
*/
class RecurrentLayer : public Layer {
public:
explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
@@ -49,23 +59,69 @@ public:
LayerStatePtr getState();
protected:
/**
* @brief If the user does not set --rnn_use_batch=true, it computes the
* rnn forward pass one sequence by one sequence by default.
* @param batchSize The total number of words of all samples in this batch.
* @param numSequences The number of samples.
* @param starts The start position of each sample.
*/
void forwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn forward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the number
* of words in this sequence.
*/
void forwardOneSequence(int start, int length);
/**
* @brief Compute rnn backward one sequence by one sequence.
* @param batchSize The total number of words of all samples in this batch.
* @param numSequences The number of samples.
* @param starts The start position of each sample.
*/
void backwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn backward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the number
* of words in this sequence.
*/
void backwardOneSequence(int start, int length);
/**
* @brief Reorganize the input into batches and compute the rnn forward pass
* batch by batch. It converts the batch shape back to sequences after the
* forward pass finishes. See the SequenceToBatch class for the batch layout.
* @param batchSize The total number of words of all samples in this batch.
* @param numSequences The number of samples.
* @param starts The start position of each sample.
*/
void forwardBatch(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Reorganize the input into batches and compute the rnn backward pass
* batch by batch.
* @param batchSize The total number of words of all samples in this batch.
* @param numSequences The number of samples.
* @param starts The start position of each sample.
*/
void backwardBatch(int batchSize, size_t numSequences, const int* starts);
protected:
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> bias_;
// frameOutput_[i] is used to hold the i-th sample of output_
/// frameOutput_[i] is used to hold the i-th sample of output_
std::vector<Argument> frameOutput_;
MatrixPtr prevOutput_;
/// Whether to compute the rnn in reverse order.
bool reversed_;
/// If computing batch by batch, batchValue_ is used to save the
/// reorganized input value.
std::unique_ptr<SequenceToBatch> batchValue_;
/// If computing batch by batch, batchGrad_ is used to save the
/// gradient with respect to the reorganized input value.
std::unique_ptr<SequenceToBatch> batchGrad_;
};
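A minimal scalar sketch of the recurrence documented above, assuming act is tanh and collapsing the weight matrix W to a scalar w (illustrative only, not the layer's actual forwardOneSequence):

#include <cmath>
#include <vector>

// Per-sequence forward pass of the documented recurrence:
//   out_i = act(in_i)                  for the first step
//   out_i = act(in_i + out_prev * w)   for the remaining steps
// With reversed == true the loop simply runs from the other end.
std::vector<double> forwardOneSequenceSketch(const std::vector<double> &in,
                                             double w, bool reversed) {
  const int n = static_cast<int>(in.size());
  std::vector<double> out(n);
  if (n == 0) return out;
  if (!reversed) {
    out[0] = std::tanh(in[0]);
    for (int i = 1; i < n; ++i) {
      out[i] = std::tanh(in[i] + out[i - 1] * w);
    }
  } else {
    out[n - 1] = std::tanh(in[n - 1]);
    for (int i = n - 2; i >= 0; --i) {
      out[i] = std::tanh(in[i] + out[i + 1] * w);
    }
  }
  return out;
}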
......
@@ -2617,7 +2617,6 @@ def tensor_layer(input, size, act=None, name=None,
In this formula:
- :math:`x_{1}`: the first input contains M elements.
- :math:`x_{2}`: the second input contains N elements.
- y[out]: contains K elements.
- :math:`y_{i}`: the i-th element of y.
- :math:`W_{i}`: the i-th learned weight, shape is [M, N].
- :math:`{x_{2}}^\mathrm{T}`: the transpose of :math:`x_{2}`.
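The per-element computation the fragment above describes can be written out as a tiny sketch; the names here are illustrative, not tensor_layer's implementation:

#include <vector>

// y_i = x1 (1xM) * W_i (MxN) * x2^T (Nx1), per the docstring above.
double tensorOutputElement(const std::vector<double> &x1,
                           const std::vector<double> &x2,
                           const std::vector<std::vector<double>> &Wi) {
  double y = 0.0;
  for (size_t m = 0; m < x1.size(); ++m) {
    for (size_t n = 0; n < x2.size(); ++n) {
      y += x1[m] * Wi[m][n] * x2[n];
    }
  }
  return y;
}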
@@ -2909,6 +2908,17 @@ def block_expand_layer(input,
time step is block_y * block_x * channel. This layer can be used after a
convolutional neural network, and before a recurrent neural network.
The simple usage is:
.. code-block:: python

    block_expand = block_expand_layer(input,
                                      channel=128,
                                      stride_x=1,
                                      stride_y=1,
                                      block_x=1,
                                      block_y=3)
:param input: The input layer.
:type input: LayerOutput
:param channel: The channel number of input layer.
......