diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 35215d7fa6cf984415fad5db6e290046fc4bea46..e0888e70d598de9a7a876cfb4ce5e82539dcbbec 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -112,26 +112,22 @@ framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( void FusionLSTMOpMaker::Make() { AddInput("X", - "(LoDTensor) the first input is a LodTensor, which support " + "(LoDTensor) the input is a LodTensor, which support " "variable-time length input sequence. The underlying tensor in " - "this LoDTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddInput("H0", - "(Tensor, optional) the initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") - .AsDispensable(); - AddInput("C0", - "(Tensor, optional) the initial cell state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time.") - .AsDispensable(); - AddInput("Weight", - "(Tensor) the learnable hidden-hidden weights." + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." " - The shape is (D x 4D), where D is the hidden size. " " - Weight = {W_ch, W_ih, W_fh, W_oh}"); AddInput("Bias", - "(Tensor) the learnable weights, which contains two parts: " + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." "input-hidden bias weight and peephole connections weight if " "setting `use_peepholes` True. " "1. `use_peepholes = False` " @@ -140,29 +136,31 @@ void FusionLSTMOpMaker::Make() { "2. `use_peepholes = True` " " - The shape is (1 x 7D). " " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); AddOutput("Hidden", - "(LoDTensor) the hidden state of LSTM operator. " + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("Cell", - "(LoDTensor) the cell state of LSTM operator. " + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("XX", - "(LoDTensor) the first input is a LodTensor, which support " - "variable-time length input sequence. The underlying tensor in " - "this LoDTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddOutput("BatchedGate", - "(LoDTensor) This LoDTensor contains input gate, forget gate " - "and output gate after the nonlinear computation. This " - "LoDTensor has the same shape as the reorganized input, which " - "is also be called batch input. The LoD size is 2. The first " - "LoD is the batch offsets and the second LoD contains the " - "indexes, which denote the position of reorganized sequence " - "in the raw input.") - .AsIntermediate(); - AddOutput("BatchCellPreAct", - "(LoDTensor) This LoDTensor is obtained in the forward and used " - "in the backward.") + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input."); + AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); + AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " @@ -190,46 +188,8 @@ void FusionLSTMOpMaker::Make() { .SetDefault("tanh") .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddComment(R"DOC( -Long-Short Term Memory (LSTM) Operator. - -The defalut implementation is diagonal/peephole connection -(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - -$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$ - -$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$ - -$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$ - -$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$ - -$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ - -$$ h_t = o_t \\odot act_h(c_t) $$ - -- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix - of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ - are diagonal weight matrices for peephole connections. In our implementation, - we use vectors to reprenset these diagonal weight matrices. -- The b terms denote bias vectors ($b_i$ is the input gate bias vector). -- $\sigma$ is the non-line activations, such as logistic sigmoid function. -- $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. -- The $\odot$ is the element-wise product of the vectors. -- $act_g$ and $act_h$ are the cell input and cell output activation functions - and `tanh` is usually used for them. -- $\tilde{c_t}$ is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. - -Set `use_peepholes` False to disable peephole connection. The formula -is omitted here, please refer to the paper -http://www.bioinf.jku.at/publications/older/2604.pdf for details. - -Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ -operations on the input $x_{t}$ are NOT included in this operator. -Users can choose to use fully-connect operator before LSTM operator. - +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. )DOC"); } @@ -266,14 +226,12 @@ class FuisonLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* wx = ctx.Input("WeightX"); // x*4D - auto* wh = ctx.Input("WeightH"); // D*4D + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); auto* bias = ctx.Input("Bias"); auto* hidden_t0 = ctx.Input("H0"); auto* cell_t0 = ctx.Input("C0"); - // the result after x*Wx (size: sum_words*4D) or batched_x (size: - // sum_words*x) auto* xx = ctx.Output("XX"); auto* batched_gate = ctx.Output("BatchedGate"); auto* hidden_out = ctx.Output("Hidden"); @@ -312,7 +270,6 @@ class FuisonLSTMKernel : public framework::OpKernel { lstm_value.check_ig = nullptr; lstm_value.check_fg = nullptr; lstm_value.check_og = nullptr; - lstm_value.prev_state_value = nullptr; Tensor ordered_c0;