Unverified commit 78cc1ca6, authored by Tao Luo, committed by GitHub

Split some APIs from nn.py to rnn.py and sequence_lod.py (#21030)

* split some APIs from nn.py to rnn.py

* split some APIs from nn.py to sequence_lod.py

test=develop

* fix unit-test bug

test=develop

* fix test_layers unit-test bug

test=develop
Parent 77c20835
@@ -35,6 +35,7 @@ from .metric_op import *
from .learning_rate_scheduler import *
from .collective import *
from .distributions import *
from .sequence_lod import *
from . import rnn
__all__ = []
@@ -48,6 +49,7 @@ __all__ += detection.__all__
__all__ += metric_op.__all__
__all__ += learning_rate_scheduler.__all__
__all__ += distributions.__all__
__all__ += sequence_lod.__all__
__all__ += rnn.__all__
from .rnn import *
@@ -40,10 +40,6 @@ __all__ = [
'fc',
'center_loss',
'embedding',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'linear_chain_crf',
'crf_decoding',
'cos_sim',
@@ -51,11 +47,8 @@ __all__ = [
'bpr_loss',
'square_error_cost',
'chunk_eval',
'sequence_conv',
'conv2d',
'conv3d',
'sequence_pool',
'sequence_softmax',
'softmax',
'pool2d',
'pool3d',
@@ -64,14 +57,8 @@ __all__ = [
'batch_norm',
'instance_norm',
'data_norm',
'beam_search_decode',
'conv2d_transpose',
'conv3d_transpose',
'sequence_expand',
'sequence_expand_as',
'sequence_pad',
'sequence_unpad',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
@@ -79,9 +66,6 @@ __all__ = [
'reduce_prod',
'reduce_all',
'reduce_any',
'sequence_first_step',
'sequence_last_step',
'sequence_slice',
'dropout',
'split',
'ctc_greedy_decoder',
@@ -90,13 +74,11 @@ __all__ = [
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'sampled_softmax_with_cross_entropy',
'hsigmoid',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
@@ -128,7 +110,6 @@ __all__ = [
'scatter',
'scatter_nd_add',
'scatter_nd',
'sequence_scatter',
'random_crop',
'mean_iou',
'relu',
@@ -149,16 +130,13 @@ __all__ = [
'leaky_relu',
'soft_relu',
'flatten',
'sequence_mask',
'stack',
'pad2d',
'unstack',
'sequence_enumerate',
'unique',
'unique_with_counts',
'expand',
'expand_as',
'sequence_concat',
'scale',
'elementwise_add',
'elementwise_div',
@@ -191,7 +169,6 @@ __all__ = [
'maxout',
'space_to_depth',
'affine_grid',
'sequence_reverse',
'affine_channel',
'similarity_focus',
'hash',
@@ -201,7 +178,6 @@ __all__ = [
'bilinear_tensor_product',
'merge_selected_rows',
'get_tensor_from_selected_rows',
'lstm',
'shuffle_channel',
'temporal_shift',
'py_func',
@@ -688,844 +664,6 @@ def _pull_box_sparse(input, size, dtype='float32'):
return outs
def dynamic_lstm(input,
size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32',
name=None):
"""
**Note**:
1. This OP only supports LoDTensor as inputs. If you need to deal with Tensor, please use :ref:`api_fluid_layers_lstm` .
2. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
The implementation of this OP includes diagonal/peephole connections.
Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you do not need peephole connections, please set use_peepholes to False .
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
.. math::
\widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c})
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot tanh(c_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
size (int): must be 4 * hidden_size.
h_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
Data type is float32 or float64. If set to None, it will be a vector of all 0. Default: None.
c_0( :ref:`api_guide_Variable_en` , optional): The initial cell state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
Data type is float32 or float64. If set to None, it will be a vector of all 0. `h_0` and `c_0` can be None but only at the same time. Default: None.
param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` .
If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.
- Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [hidden_size, 4*hidden_size].
bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
Please refer to :ref:`api_fluid_ParamAttr` . Default: None.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is [1, 4*hidden_size].
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is [1, 7*hidden_size].
use_peepholes (bool, optional): Whether to use peephole connection or not. Default: True.
is_reverse (bool, optional): Whether to calculate reverse LSTM. Default: False.
gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default: "sigmoid".
cell_activation (str, optional): The activation for cell output. Default: "tanh".
candidate_activation (str, optional): The activation for candidate hidden state. Default: "tanh".
dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32".
name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
Returns:
tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
The hidden state and cell state of LSTM
- hidden: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
- cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
emb_dim = 256
vocab_size = 10000
hidden_dim = 512
data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
bias_attr=False)
forward, cell = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
forward.shape # (-1, 512)
cell.shape # (-1, 512)
"""
assert in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
helper.append_op(
type='lstm',
inputs=inputs,
outputs={
'Hidden': hidden,
'Cell': cell,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation
})
return hidden, cell
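# Illustration only (not part of the Paddle API): a minimal NumPy sketch of the
# per-timestep equations in the dynamic_lstm docstring above, without peepholes.
# The helper name `lstm_step` and the i/f/c/o gate ordering are assumptions made
# for readability, not necessarily the layout used by the underlying `lstm` op.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_proj, h_prev, c_prev, w_h, b):
    # x_proj: [4*D] input already mapped to 4*D by an fc layer, as dynamic_lstm expects
    # h_prev, c_prev: [D]; w_h: [D, 4*D] recurrent weight; b: [4*D] merged bias
    gates = x_proj + h_prev @ w_h + b
    i, f, c_tilde, o = np.split(gates, 4)
    i, f, o = _sigmoid(i), _sigmoid(f), _sigmoid(o)
    c = f * c_prev + i * np.tanh(c_tilde)          # c_t = f_t . c_{t-1} + i_t . c~_t
    h = o * np.tanh(c)                             # h_t = o_t . tanh(c_t)
    return h, c

D = 8
h, c = lstm_step(np.random.randn(4 * D), np.zeros(D), np.zeros(D),
                 np.random.randn(D, 4 * D), np.zeros(4 * D))
assert h.shape == c.shape == (D,)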
def lstm(input,
init_h,
init_c,
max_len,
hidden_size,
num_layers,
dropout_prob=0.0,
is_bidirec=False,
is_test=False,
name=None,
default_initializer=None,
seed=-1):
"""
**Note**:
This OP only supports running on GPU devices.
This OP implements LSTM operation - `Hochreiter, S., & Schmidhuber, J. (1997) <http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf>`_ .
The implementation of this OP does not include diagonal/peephole connections.
Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you need peephole connections, please use :ref:`api_fluid_layers_dynamic_lstm` .
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
.. math::
\widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c})
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot tanh(c_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64
init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
max_len (int): max length of LSTM. The first dim of the input tensor CANNOT be greater than max_len.
hidden_size (int): hidden size of the LSTM.
num_layers (int): total layers number of the LSTM.
dropout_prob(float, optional): dropout probability. Dropout ONLY works between RNN layers, NOT between time steps.
There is NO dropout applied to the output of the last RNN layer.
Default: 0.0.
is_bidirec (bool, optional): If it is bidirectional. Default: False.
is_test (bool, optional): Whether it is in the test phase. Default: False.
name (str, optional): A name for this layer. If set None, the layer
will be named automatically. Default: None.
default_initializer(Initializer, optional): The initializer used to initialize the weight.
If set to None, the default initializer will be used. Default: None.
seed(int, optional): Seed for dropout in LSTM. If it is -1, dropout will use a random seed. Default: -1.
Returns:
tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) :
Three tensors, rnn_out, last_h, last_c:
- rnn_out is result of LSTM hidden, shape is :math:`[seq\_len, batch\_size, hidden\_size]` \
if is_bidirec set to True, shape will be :math:`[seq\_len, batch\_size, hidden\_size*2]`
- last_h is the hidden state of the last step of LSTM \
shape is :math:`[num\_layers, batch\_size, hidden\_size]` \
if is_bidirec set to True, shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
- last_c(Tensor): the cell state of the last step of LSTM \
shape is :math:`[num\_layers, batch\_size, hidden\_size]` \
if is_bidirec set to True, shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
emb_dim = 256
vocab_size = 10000
data = fluid.data(name='x', shape=[None, 100], dtype='int64')
emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
batch_size = 20
max_len = 100
dropout_prob = 0.2
input_size = 100
hidden_size = 150
num_layers = 1
init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \
max_len, hidden_size, num_layers, \
dropout_prob=dropout_prob)
rnn_out.shape # (-1, 100, 150)
last_h.shape # (1, 20, 150)
last_c.shape # (1, 20, 150)
"""
helper = LayerHelper('cudnn_lstm', **locals())
dtype = input.dtype
input_shape = list(input.shape)
input_size = input_shape[-1]
weight_size = 0
for i in range(num_layers):
if i == 0:
input_weight_size = (input_size * hidden_size) * 4
else:
if is_bidirec:
input_weight_size = (hidden_size * 2 * hidden_size) * 4
else:
input_weight_size = (hidden_size * hidden_size) * 4
hidden_weight_size = (hidden_size * hidden_size) * 4
if is_bidirec:
weight_size += (input_weight_size + hidden_weight_size) * 2
weight_size += hidden_size * 8 * 2
else:
weight_size += input_weight_size + hidden_weight_size
weight_size += hidden_size * 8
weight = helper.create_parameter(
attr=helper.param_attr,
shape=[weight_size],
dtype=dtype,
default_initializer=default_initializer)
out = helper.create_variable_for_type_inference(dtype)
last_h = helper.create_variable_for_type_inference(dtype)
last_c = helper.create_variable_for_type_inference(dtype)
cache = helper.create_variable(
persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
helper.append_op(
type='cudnn_lstm',
inputs={
'Input': input,
'InitH': init_h,
'InitC': init_c,
'W': weight,
'Cache': cache,
},
outputs={
'Out': out,
'last_h': last_h,
'last_c': last_c,
},
attrs={
'max_len': max_len,
'is_bidirec': is_bidirec,
'input_size': input_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'is_test': is_test,
'dropout_prob': dropout_prob,
'seed': seed,
})
return out, last_h, last_c
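# Illustration only: a worked example of the flat cuDNN weight size computed by the
# loop above; `cudnn_lstm_weight_size` is a hypothetical helper, not a Paddle API.
def cudnn_lstm_weight_size(input_size, hidden_size, num_layers, is_bidirec=False):
    total = 0
    for i in range(num_layers):
        # layer 0 consumes the raw input; later layers consume the previous layer's output
        in_sz = input_size if i == 0 else hidden_size * (2 if is_bidirec else 1)
        per_dir = (in_sz * hidden_size) * 4 + (hidden_size * hidden_size) * 4
        if is_bidirec:
            total += per_dir * 2 + hidden_size * 8 * 2
        else:
            total += per_dir + hidden_size * 8
    return total

# e.g. input_size=100, hidden_size=150, one unidirectional layer:
# 4*100*150 + 4*150*150 + 8*150 = 60000 + 90000 + 1200 = 151200
assert cudnn_lstm_weight_size(100, 150, 1) == 151200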
def dynamic_lstmp(input,
size,
proj_size,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
proj_activation='tanh',
dtype='float32',
name=None,
h_0=None,
c_0=None,
cell_clip=None,
proj_clip=None):
"""
**Note**:
1. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
This OP implements the LSTMP (LSTM Projected) layer.
The LSTMP layer has a separate linear mapping layer behind the LSTM layer. -- `Sak, H., Senior, A., & Beaufays, F. (2014) <https://ai.google/research/pubs/pub43905.pdf>`_ .
Compared with the standard LSTM layer, LSTMP has an additional linear mapping layer,
which is used to map from the original hidden state :math:`h_t` to the lower dimensional state :math:`r_t` .
This reduces the total number of parameters and computational complexity, especially when the output unit is relatively large.
The default implementation of the OP contains diagonal/peephole connections,
please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you need to disable the peephole connections, set use_peepholes to False.
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_{t-1} + b_o)
.. math::
\widetilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot act_h(c_t)
.. math::
r_t = \overline{act_h}(W_{rh}h_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`r_{t}` represents the projected output of the hidden state :math:`h_{t}`
- :math:`h_{t-1}, c_{t-1}, r_{t-1}` represent the hidden state, cell state and projected output at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input( :ref:`api_guide_Variable_en` ): The input of dynamic_lstmp layer, which supports
variable-length input sequences.
It is a multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
size(int): must be 4 * hidden_size.
proj_size(int): The size of projection output.
param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` .
If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.
- Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [P, 4*hidden_size] , where P is the projection size.
- Projection weight = :math:`\{ W_{rh} \}` , the shape is [hidden_size, P].
bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
Please refer to :ref:`api_fluid_ParamAttr` . Default: None.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is [1, 4*hidden_size].
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is [1, 7*hidden_size].
use_peepholes (bool, optional): Whether to use peephole connection or not. Default True.
is_reverse (bool, optional): Whether to calculate reverse LSTM. Default False.
gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default "sigmoid".
cell_activation (str, optional): The activation for cell output. Default "tanh".
candidate_activation (str, optional): The activation for candidate hidden state. Default "tanh".
proj_activation(str, optional): The activation for projection output. Default "tanh".
dtype (str, optional): Data type, can be "float32" or "float64". Default "float32".
name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
h_0( :ref:`api_guide_Variable` , optional): The initial hidden state is an optional input, default is zero.
This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. Default: None.
c_0( :ref:`api_guide_Variable` , optional): The initial cell state is an optional input, default is zero.
This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size.
`h_0` and `c_0` can be None but only at the same time. Default: None.
cell_clip(float, optional): If not None, the cell state is clipped
by this value prior to the cell output activation. Default: None.
proj_clip(float, optional): If `proj_size > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`. Default: None.
Returns:
tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
The hidden state and cell state of LSTMP
- hidden: LoDTensor with shape of :math:`[T, P]` , and its lod and dtype is the same as the input.
- cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='sequence', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim, proj_dim = 512, 256
fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
act=None, bias_attr=None)
proj_out, last_c = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
use_peepholes=False,
is_reverse=True,
cell_activation="tanh",
proj_activation="tanh")
proj_out.shape # (-1, 256)
last_c.shape # (-1, 512)
"""
assert in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstmp', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
proj_weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
projection = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
ordered_proj0 = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, proj_size), \
'The shape of h0 should be (batch_size, %d)' % proj_size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
if cell_clip:
assert cell_clip >= 0, "cell_clip should not be negative."
if proj_clip:
assert proj_clip >= 0, "proj_clip should not be negative."
helper.append_op(
type='lstmp',
inputs=inputs,
outputs={
'Projection': projection,
'Cell': cell,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'cell_clip': cell_clip,
'proj_clip': proj_clip,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation,
'proj_activation': proj_activation
})
return projection, cell
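# Illustration only: a minimal NumPy sketch of one LSTMP timestep following the
# equations in the dynamic_lstmp docstring above (peepholes read c_{t-1}, as written
# there). The helper name `lstmp_step` and the gate ordering are assumptions.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstmp_step(x_proj, r_prev, c_prev, w_r, w_proj, b, w_ic, w_fc, w_oc):
    # x_proj: [4*D] input already mapped by an fc layer; r_prev: [P]; c_prev: [D]
    # w_r: [P, 4*D] recurrent weight; w_proj: [D, P] projection weight; b: [4*D]
    gates = x_proj + r_prev @ w_r + b
    gi, gf, gc, go = np.split(gates, 4)
    i = _sigmoid(gi + w_ic * c_prev)
    f = _sigmoid(gf + w_fc * c_prev)
    c = f * c_prev + i * np.tanh(gc)               # new cell state
    o = _sigmoid(go + w_oc * c_prev)               # output gate, peephole on c_{t-1} as in the formula
    h = o * np.tanh(c)
    r = np.tanh(h @ w_proj)                        # r_t: projection down to size P
    return r, c

D, P = 8, 4
r, c = lstmp_step(np.random.randn(4 * D), np.zeros(P), np.zeros(D),
                  np.random.randn(P, 4 * D), np.random.randn(D, P),
                  np.zeros(4 * D), np.zeros(D), np.zeros(D), np.zeros(D))
assert r.shape == (P,) and c.shape == (D,)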
def dynamic_gru(input,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False):
"""
**Note: The input type of this OP must be LoDTensor. If the input type to be
processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` .
This operator is used to perform the calculations for a single layer of
Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
in one time step support these two modes:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
:math:`x_t` is the input of current time step, but it is not from ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` .
**Note** that a fully-connected layer whose size is 3 times ``size`` should
be used before this operator, and its output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrices are merged into a
tensor shaped :math:`[D, D \\times 3]` , and the three biases are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Args:
input(Variable): A LoDTensor whose lod level is 1, representing the input
after linear projection. Its shape should be :math:`[T, D \\times 3]` ,
where :math:`T` stands for the total sequence lengths in this mini-batch,
:math:`D` for the hidden size. The data type should be float32 or float64.
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
is_reverse(bool, optional): Whether to compute in the reversed order of
input sequences. Default False.
gate_activation(str, optional): The activation function corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
candidate_activation(str, optional): The activation function corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
h_0 (Variable, optional): A Tensor representing the initial hidden state.
If not provided, the default initial hidden state is 0. The shape is
:math:`[N, D]` , where :math:`N` is the number of sequences in the
mini-batch, :math:`D` for the hidden size. The data type should be
same as ``input`` . Default None.
Returns:
Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \
where :math:`T` stands for the total sequence lengths in this mini-batch \
:math:`D` for the hidden size. It represents GRU transformed sequence output, \
and has the same lod and data type with ``input`` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='sequence',
shape=[None],
dtype='int64',
lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
"""
assert in_dygraph_mode(
) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
helper = LayerHelper('gru', **locals())
dtype = helper.input_dtype()
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
bias = helper.create_parameter(
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0:
assert h_0.shape == (
batch_size, size
), 'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='gru',
inputs=inputs,
outputs={
'Hidden': hidden,
'BatchGate': batch_gate,
'BatchResetHiddenPrev': batch_reset_hidden_prev,
'BatchHidden': batch_hidden
},
attrs={
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'activation': candidate_activation,
'origin_mode': origin_mode
})
return hidden
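# Illustration only: a minimal NumPy sketch of one GRU timestep for the two modes
# described in the dynamic_gru docstring above. The helper name `gru_step` and the
# update/reset/candidate split order of `x_proj` are assumptions.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_proj, h_prev, w_uh, w_rh, w_ch, b_u, b_r, b_c, origin_mode=False):
    # x_proj: [3*D] input already mapped to 3*D by an fc layer, as dynamic_gru expects
    xu, xr, xc = np.split(x_proj, 3)
    u = _sigmoid(xu + h_prev @ w_uh + b_u)               # update gate
    r = _sigmoid(xr + h_prev @ w_rh + b_r)               # reset gate
    h_tilde = np.tanh(xc + (r * h_prev) @ w_ch + b_c)    # candidate hidden
    if origin_mode:                                      # Cho et al., 2014
        return u * h_prev + (1.0 - u) * h_tilde
    return (1.0 - u) * h_prev + u * h_tilde              # Chung et al., 2014 (default)

D = 8
h = gru_step(np.random.randn(3 * D), np.zeros(D),
             np.random.randn(D, D), np.random.randn(D, D), np.random.randn(D, D),
             np.zeros(D), np.zeros(D), np.zeros(D))
assert h.shape == (D,)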
def gru_unit(input,
hidden,
size,
param_attr=None,
bias_attr=None,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False):
"""
Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
one time step and it supports these two modes:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
:math:`x_t` is the input of current time step, but it is not ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` .
**Note** that a fully-connected layer whose size is 3 times the GRU hidden size should
be used before this operator, and its output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrices are merged into a
tensor shaped :math:`[D, D \\times 3]` , and the three biases are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Args:
input(Variable): A 2D Tensor representing the input after linear projection.
Its shape should be :math:`[N, D \\times 3]` ,
where :math:`N` stands for batch size, :math:`D` for the hidden size.
The data type should be float32 or float64.
hidden(Variable): A 2D Tensor representing the hidden state from previous step.
Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size,
:math:`D` for the hidden size. The data type should be same as ``input`` .
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
activation(str, optional): The activation function corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
gate_activation(str, optional): The activation function corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
Returns:
tuple: The tuple contains three Tensor variables with the same data type \
as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \
reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \
concatenation of :math:`h_t, r_t, \\tilde{h_t}` . They have shapes \
:math:`[N, D]` , :math:`[N, D]` and :math:`[N, D \times 3]` respectively. \
Usually only the hidden state for next time step ( :math:`h_t` ) is used \
as output and state, the other two are intermediate results of calculations.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='step_data', shape=[None], dtype='int64')
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.gru_unit(
input=x, hidden=pre_hidden, size=hidden_dim * 3)
"""
activation_dict = dict(
identity=0,
sigmoid=1,
tanh=2,
relu=3, )
activation = activation_dict[activation]
gate_activation = activation_dict[gate_activation]
helper = LayerHelper('gru_unit', **locals())
dtype = helper.input_dtype()
size = size // 3
# create weight
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
gate = helper.create_variable_for_type_inference(dtype)
reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
updated_hidden = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
# create bias
if helper.bias_attr:
bias_size = [1, 3 * size]
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
inputs['Bias'] = bias
helper.append_op(
type='gru_unit',
inputs=inputs,
outputs={
'Gate': gate,
'ResetHiddenPrev': reset_hidden_pre,
'Hidden': updated_hidden,
},
attrs={
'activation': activation,
'gate_activation': gate_activation,
'origin_mode': origin_mode
})
return updated_hidden, reset_hidden_pre, gate
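# Illustration only: how the merged [D, 3*D] GRU weight described in the gru_unit
# docstring above can be split into W_uh, W_rh and W_ch. `split_gru_weight` is a
# hypothetical helper; the order of W_uh and W_rh inside the first 2*D columns
# follows the description above and is otherwise an assumption.
import numpy as np

def split_gru_weight(weight, hidden_size):
    D = hidden_size
    assert weight.shape == (D, 3 * D)
    w_uh = weight[:, :D]          # update-gate recurrent weight
    w_rh = weight[:, D:2 * D]     # reset-gate recurrent weight
    w_ch = weight[:, 2 * D:]      # candidate recurrent weight
    return w_uh, w_rh, w_ch

w_uh, w_rh, w_ch = split_gru_weight(np.zeros((4, 12)), hidden_size=4)
assert w_uh.shape == w_rh.shape == w_ch.shape == (4, 4)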
@templatedoc()
def linear_chain_crf(input, label, param_attr=None, length=None):
"""
......@@ -2198,244 +1336,34 @@ def chunk_eval(input,
num_correct_chunks)
@templatedoc()
def sequence_conv(input,
num_filters,
filter_size=3,
filter_stride=1,
padding=True,
padding_start=None,
bias_attr=None,
param_attr=None,
act=None,
name=None):
def softmax(input, use_cudnn=False, name=None, axis=-1):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ).
This operator implements the softmax layer. The calculation process is as follows:
This operator receives input sequences with variable length and other convolutional
configuration parameters(num_filters, filter_size) to apply the convolution operation.
It fills all-zero padding data on both sides of the sequence by default to ensure that
the output is the same length as the input. You can customize the padding behavior by
configuring the parameter :attr:`padding\_start` .
1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
**Warning:** the parameter :attr:`padding` takes no effect and will be deprecated in the future.
2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is the same as the dimension :attr:`axis` of the input
tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
.. code-block:: text
3. After the softmax operation is completed, the inverse operations of steps 1 and 2
are performed to restore the two-dimensional matrix to the same dimension as the ``input``.
Here we will illustrate the details of the padding operation:
For a mini-batch of 2 variable-length sentences, containing 3 and 1 time-steps:
Assumed input (X) is a [4, N] float LoDTensor, and for the sake of simplicity, we assume N=2.
input.data = [[1, 1],
[2, 2],
[3, 3],
[4, 4]]
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.
This is to say that input (X) has 4 words and the dimension of each word
representation is 2.
For each row :math:`i` and each column :math:`j` in the matrix, we have:
* Case1:
.. math::
If padding_start is -1 and filter_size is 3,
the length of the padding data is calculated as follows:
up_pad_len = max(0, -padding_start) = 1
down_pad_len = max(0, filter_size + padding_start - 1) = 1
The output of the input sequence after padding is:
data_after_padding = [[0, 0, 1, 1, 2, 2],
[1, 1, 2, 2, 3, 3],
[2, 2, 3, 3, 0, 0],
[0, 0, 4, 4, 0, 0]]
It will be multiplied by the filter weight to get the final output.
Assume num_filters = 3
output.data = [[ 0.3234, -0.2334, 0.7433],
[ 0.5646, 0.9464, -0.1223],
[-0.1343, 0.5653, 0.4555],
[ 0.9954, -0.1234, -0.1234]]
output.shape = [4, 3] # 3 = num_filters
output.lod = [[0, 3, 4]] # Remain the same
Args:
input (Variable): LoDTensor with shape :math:`(M, K)`, where M is the total time-step of mini-batch
and K is hidden_size of input. Only lod_level of 1 is supported. The data type should be float32 or
float64.
num_filters (int): the number of filters.
filter_size (int): the height of filter. Specified filter width is not supported, the width is
hidden_size by default. Default: 3.
filter_stride (int): stride of the filter. Currently only supports :attr:`stride` = 1.
padding (bool): the parameter :attr:`padding` takes no effect and will be discarded in the
future. Currently, it will always pad the input to make sure the length of the output is
the same as that of the input, whether :attr:`padding` is set true or false, because the length of
the input sequence may be shorter than :attr:`filter\_size`, which would cause the convolution
result to be computed incorrectly. These padding data will not be trainable or updated
while training. Default: True.
padding_start (int): It is used to indicate the start index for padding the input
sequence, which can be negative. The negative number means to pad
:attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance.
The positive number means to skip :attr:`padding_start` time-steps of each instance,
and it will pad :math:`filter\_size + padding\_start - 1` time-steps of all-zero data
at the end of the sequence to ensure that the output is the same length as the input.
If set None, the same length :math:`\\frac{filter\_size}{2}` of data will be filled
on both sides of the sequence. If set 0, the length of :math:`filter\_size - 1` data
is padded at the end of each input sequence. Default: None.
bias_attr (ParamAttr): To specify the bias parameter property. Default: None, which means the
default bias parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
param_attr (ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
act (str): Activation to be applied to the output of this layer, such as tanh, softmax,
sigmoid, relu. For more information, please refer to :ref:`api_guide_activations_en` . Default: None.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: LoDTensor with the same length as input. The data type is float32 or float64, which is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1)
x_conved = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()
filter_shape = [filter_size * input.shape[1], num_filters]
filter_param = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype)
pre_bias = helper.create_variable_for_type_inference(dtype)
if padding_start is None:
padding_start = -int(filter_size // 2)
helper.append_op(
type='sequence_conv',
inputs={
'X': [input],
'Filter': [filter_param],
},
outputs={"Out": pre_bias},
attrs={
'contextStride': filter_stride,
'contextStart': padding_start,
'contextLength': filter_size,
})
pre_act = helper.append_bias_op(pre_bias)
return helper.append_activation(pre_act)
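# Illustration only: a NumPy sketch of the zero-padding / context-window step
# described in the sequence_conv docstring above (Case1: padding_start=-1,
# filter_size=3). `build_context_windows` is a hypothetical helper, not a Paddle API.
import numpy as np

def build_context_windows(x, lod, filter_size=3, padding_start=-1):
    # x: [T, K] flattened sequences; lod: offsets such as [0, 3, 4]
    K = x.shape[1]
    up_pad = max(0, -padding_start)
    down_pad = max(0, filter_size + padding_start - 1)
    rows = []
    for start, end in zip(lod[:-1], lod[1:]):
        seq = np.concatenate(
            [np.zeros((up_pad, K)), x[start:end], np.zeros((down_pad, K))])
        for t in range(end - start):
            # each output row is filter_size consecutive (padded) steps, flattened,
            # ready to be multiplied by a [filter_size*K, num_filters] filter
            rows.append(seq[t:t + filter_size].reshape(-1))
    return np.array(rows)

x = np.array([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
out = build_context_windows(x, lod=[0, 3, 4])
# matches data_after_padding in the example above
assert out.shape == (4, 6) and out[0].tolist() == [0, 0, 1, 1, 2, 2]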
def sequence_softmax(input, use_cudnn=False, name=None):
"""
**Note**:
**The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax`
A LoD-tensor can be regarded as several sequences, and this op applies the softmax algorithm to each sequence.
The shape of input Tensor can be :math:`[N, 1]` or :math:`[N]`, where :math:`N`
is the sum of the length of all sequences. Recommended usage: :math:`[N]`.
For i-th sequence in a mini-batch:
.. math::
Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
For example, for a LoD-Tensor with 6 sequences ([3, 2, 4, 1, 2, 3] - sequence length list in order),
the lod in the runtime is [[0, 3, 5, 9, 10, 12, 15]],
then softmax will be computed among :math:`X[0:3,:],X[3:5,:],X[5:9,:],X[9:10,:],X[10:12,:],X[12:15,:]`,
and :math:`N` turns out to be 15.
.. code-block:: text
*Case 1:
Given:
input.data = [0.7, 1, 0.6,
1.5, 1.1,
1.2, 0.2, 0.6, 1.9,
3.1,
2.5, 0.8,
0.1, 2.4, 1.3]
input.lod = [[0, 3, 5, 9, 10, 12, 15]]
then:
output.data = [0.30724832, 0.41474187, 0.2780098,
0.59868765, 0.40131235,
0.2544242, 0.09359743, 0.13963096, 0.5123474,
1.,
0.84553474, 0.15446526,
0.06995796, 0.69777346, 0.23226859]
output.lod = [[0, 3, 5, 9, 10, 12, 15]]
Args:
input (Variable): A LoDTensor with shape of :math:`[N, 1]` or :math:`[N]`. Recommended usage: :math:`[N]`.
Supported data types: float32, float64.
use_cudnn (bool, optional): Use cudnn kernel or not. Effective only when the cudnn version of the paddle
library is installed and a GPU is used for training or inference. Default: False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: A LoD-Tensor which has the same shape and data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1)
x_sequence_softmax_1 = fluid.layers.sequence_softmax(input=x)
y = fluid.data(name='y', shape=[7],
dtype='float32', lod_level=1)
x_sequence_softmax_2 = fluid.layers.sequence_softmax(input=y)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="sequence_softmax",
inputs={"X": input},
outputs={"Out": softmax_out},
attrs={"use_cudnn": use_cudnn})
return softmax_out
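# Illustration only: a NumPy sketch of the per-sequence softmax described in the
# sequence_softmax docstring above, checked against Case 1 of that docstring.
# `sequence_softmax_np` is a hypothetical helper, not a Paddle API.
import numpy as np

def sequence_softmax_np(x, lod):
    # x: [N] scores; lod: offsets such as [0, 3, 5, 9, 10, 12, 15]
    out = np.empty_like(x, dtype='float64')
    for start, end in zip(lod[:-1], lod[1:]):
        e = np.exp(x[start:end] - np.max(x[start:end]))   # numerically stable softmax
        out[start:end] = e / e.sum()
    return out

x = np.array([0.7, 1, 0.6, 1.5, 1.1, 1.2, 0.2, 0.6, 1.9, 3.1, 2.5, 0.8, 0.1, 2.4, 1.3])
out = sequence_softmax_np(x, lod=[0, 3, 5, 9, 10, 12, 15])
# first segment matches the example above: ~[0.307, 0.415, 0.278]
assert np.allclose(out[:3], [0.30724832, 0.41474187, 0.2780098], atol=1e-5)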
def softmax(input, use_cudnn=False, name=None, axis=-1):
"""
This operator implements the softmax layer. The calculation process is as follows:
1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is the same as the dimension :attr:`axis` of the input
tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
3. After the softmax operation is completed, the inverse operations of steps 1 and 2
are performed to restore the two-dimensional matrix to the same dimension as the ``input``.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.
For each row :math:`i` and each column :math:`j` in the matrix, we have:
.. math::
Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])}
Example:
@@ -3061,493 +1989,148 @@ def conv3d(input,
return helper.append_activation(pre_act)
def sequence_pool(input, pool_type, is_test=False, pad_value=0.0):
@templatedoc()
def pool2d(input,
pool_size=-1,
pool_type="max",
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
name=None,
exclusive=True,
data_format="NCHW"):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ).
This operator only supports LoDTensor as input. It will apply specified pooling
operation on the input LoDTensor. It pools features of all time-steps of each
sequence at the last lod_level using :attr:`pool_type` mentioned in the parameters,
such as sum, average, sqrt, etc.
It supports six pool_type:
- average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
- sum: :math:`Out[i] = \sum_jX_{ij}`
- sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
- max: :math:`Out[i] = max(X_i)`
- last: :math:`Out[i] = X_{N_i}`
- first: :math:`Out[i] = X_0`
where :math:`N_i` is the length of i-th input sequence.
.. code-block:: text
Case 1:
input is a 1-level LoDTensor and pad_value = 0.0:
input.lod = [[0, 2, 5, 7, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is LoDTensor:
out.shape = [4, 1]
with condition out.shape[0] == len(x.lod[-1]) == 4
for different pool_type:
average: out.data = [[2.], [4.], [3.], [0.0]], where 2.=(1. + 3.)/2, 4.=(2. + 4. + 6.)/3, 3.=(5. + 1.)/2
sum : out.data = [[4.], [12.], [6.], [0.0]], where 4.=1. + 3., 12.=2. + 4. + 6., 6.=5. + 1.
sqrt : out.data = [[2.82], [6.93], [4.24], [0.0]], where 2.82=(1. + 3.)/sqrt(2), 6.93=(2. + 4. + 6.)/sqrt(3), 4.24=(5. + 1.)/sqrt(2)
max : out.data = [[3.], [6.], [5.], [0.0]], where 3.=max(1., 3.), 6.=max(2., 4., 6.), 5.=max(5., 1.)
last : out.data = [[3.], [6.], [1.], [0.0]], where 3.=last(1., 3.), 6.=last(2., 4., 6.), 1.=last(5., 1.)
first : out.data = [[1.], [2.], [5.], [0.0]], where 1.=first(1., 3.), 2.=first(2., 4., 6.), 5.=first(5., 1.)
and all above [0.0] at last of out.data is padding data.
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means empty sequence.
The first sequence contains 2 subsequences with length info [1, 2];
The last sequence contains 3 subsequences with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
If pool_type = sum, it will apply pooling on the last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
where out.shape[0] == len(x.lod[-1]) == 5
sum: out.data = [[1.], [5.], [4.], [0.0], [12.]]
where 1.=1., 5.=3. + 2., 4.=4., 0.0=pad_value, 12.=6. + 5. + 1.
${comment}
Args:
input (variable): LoDTensor with lod_level no more than 2. The data type should be float32.
pool_type (str): The pooling type that supports average, sum, sqrt, max, last or first.
is_test (bool): Only works when :attr:`pool_type` is max. If set False, a temporary Tensor maxIndex is
created to record the index information corresponding to the maximum value, which is used for backward
gradient calculation in the training phase. Default: False.
pad_value (float): Used to pad the pooling result for empty input sequence. Default: 0.0
input (Variable): The input tensor of pooling operator which is a 4-D tensor with
shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
`"NHWC"`, where `N` is batch size, `C` is the number of channels,
`H` is the height of the feature, and `W` is the width of the
feature. The data type is float32 or float64.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
Variable: LoDTensor after pooling with data type float32.
Variable: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: If `pool_type` is not "max" nor "avg".
ValueError: If `global_pooling` is False and `pool_size` is -1.
TypeError: If `use_cudnn` is not a bool value.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If `pool_padding` is a string, but not "SAME" or "VALID".
ValueError: If `pool_padding` is "VALID", but `ceil_mode` is True.
ValueError: If `pool_padding` is a list or tuple, but the elements in the batch or channel dimensions are non-zero.
ShapeError: If the input is not a 4-D or 5-D Tensor.
ShapeError: If the dimension of input minus the size of `pool_stride` is not 2.
ShapeError: If the size of `pool_size` and `pool_stride` is not equal.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
max_index = helper.create_variable_for_type_inference(dtype)
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
helper.append_op(
type="sequence_pool",
inputs={"X": input},
outputs={"Out": pool_out,
"MaxIndex": max_index},
attrs={
"pooltype": pool_type.upper(),
"is_test": is_test,
"pad_value": pad_value
})
# max pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "max",
pool_stride = 1,
global_pooling=False)
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
if pool_type == 'max':
max_index.stop_gradient = True
# average pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "avg",
pool_stride = 1,
global_pooling=False)
return pool_out
# global average pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "avg",
pool_stride = 1,
global_pooling=True)
# Attr(pool_padding) is a list with 4 elements, Attr(data_format) is "NCHW".
out_1 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = [1, 2, 1, 0],
data_format = "NCHW")
@templatedoc()
def sequence_concat(input, name=None):
# Attr(pool_padding) is a string, Attr(data_format) is "NCHW".
out_2 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = "VALID",
data_format = "NCHW")
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use concat Op.(fluid.layers.** :ref:`api_fluid_layers_concat` ).
if pool_type not in ["max", "avg"]:
raise ValueError(
"Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.",
str(pool_type))
This operator only supports LoDTensor as input. It concatenates the multiple LoDTensor from input by the LoD information,
and outputs the concatenated LoDTensor.
if global_pooling is False and pool_size == -1:
raise ValueError(
"When Attr(global_pooling) is False, Attr(pool_size) must be passed "
"and be a valid value. Received pool_size: %s." % str(pool_size))
.. code-block:: text
if not isinstance(use_cudnn, bool):
raise TypeError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): %s." % str(use_cudnn))
input is a list of LoDTensor:
input = [x1, x2]
where:
x1.lod = [[0, 3, 5]]
x1.data = [[1], [2], [3], [4], [5]]
x1.shape = [5, 1]
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
x2.lod = [[0, 2, 4]]
x2.data = [[6], [7], [8], [9]]
x2.shape = [4, 1]
and should satisfy: len(x1.lod[0]) == len(x2.lod[0])
pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
output is LoDTensor:
out.lod = [[0, 3+2, 5+4]]
out.data = [[1], [2], [3], [6], [7], [4], [5], [8], [9]]
out.shape = [9, 1]
Args:
input(list of Variable): List of LoDTensor to be concatenated. The length of each LoDTensor should be same.
The data type can be float32, float64 or int64.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: Output the concatenated LoDTensor. The data type is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1)
y = fluid.data(name='y', shape=[-1, 10], dtype='float32', lod_level=1)
out = fluid.layers.sequence_concat(input=[x, y])
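# A runnable sketch of the text case above (illustrative only: the names
# `demo_prog`, `a` and `b` are hypothetical and not part of the API). It is
# built in a separate Program so it does not touch the variables declared above.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    a = fluid.data(name='a', shape=[None, 1], dtype='float32', lod_level=1)
    b = fluid.data(name='b', shape=[None, 1], dtype='float32', lod_level=1)
    concat_out = fluid.layers.sequence_concat(input=[a, b])
place = fluid.CPUPlace()
exe = fluid.Executor(place)
a_lod = fluid.create_lod_tensor(
    np.array([[1.], [2.], [3.], [4.], [5.]], dtype='float32'), [[3, 2]], place)
b_lod = fluid.create_lod_tensor(
    np.array([[6.], [7.], [8.], [9.]], dtype='float32'), [[2, 2]], place)
res = exe.run(demo_prog, feed={'a': a_lod, 'b': b_lod},
              fetch_list=[concat_out], return_numpy=False)
print(res[0])
# expected lod: [[0, 5, 9]]
# expected data: [1 2 3 6 7 4 5 8 9]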
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op(
type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
return out
def sequence_first_step(input):
"""
This operator only supports LoDTensor as input. Given the input LoDTensor, it will
select the first time-step feature of each sequence as output.
.. code-block:: text
Case 1:
input is 1-level LoDTensor:
input.lod = [[0, 2, 5, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is a LoDTensor:
out.shape = [3, 1]
out.shape[0] == len(x.lod[-1]) == 3
out.data = [[1.], [2.], [5.]], where 1.=first(1., 3.), 2.=first(2., 4., 6.), 5.=first(5., 1.)
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means empty sequence.
The first sequence contains 2 subsequence with length info [1, 2];
The last sequence contains 3 subsequence with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
It will apply pooling on last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is a LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
out.shape[0] == len(x.lod[-1]) == 5
out.data = [[1.], [3.], [4.], [0.0], [6.]]
where 1.=first(1.), 3.=first(3., 2.), 4.=first(4.), 0.0 = pad_value, 6.=first(6., 5., 1.)
Args:
input(Variable): LoDTensor with lod_level no more than 2. The data type should be float32.
Returns:
Variable: LoDTensor consisting of the sequence's first step vector. The data type is float32.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_first_step = fluid.layers.sequence_first_step(input=x)
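# A runnable sketch of Case 1 above (illustrative only; `demo_prog` and `seq`
# are hypothetical names, not part of the API), built in a separate Program
# so it does not interfere with the variable declared above.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    seq = fluid.data(name='seq', shape=[None, 1], dtype='float32', lod_level=1)
    first = fluid.layers.sequence_first_step(input=seq)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
seq_lod = fluid.create_lod_tensor(
    np.array([[1.], [3.], [2.], [4.], [6.], [5.], [1.]], dtype='float32'),
    [[2, 3, 2]], place)
res = exe.run(demo_prog, feed={'seq': seq_lod}, fetch_list=[first])
print(res[0])  # expected [[1.], [2.], [5.]], matching Case 1 above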
"""
return sequence_pool(input=input, pool_type="first")
def sequence_last_step(input):
"""
This operator only supports LoDTensor as input. Given the input LoDTensor, it will
select the last time-step feature of each sequence as output.
.. code-block:: text
Case 1:
input is 1-level LoDTensor:
input.lod = [[0, 2, 5, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is a LoDTensor:
out.shape = [3, 1]
out.shape[0] == len(x.lod[-1]) == 3
out.data = [[3.], [6.], [1.]], where 3.=last(1., 3.), 6.=last(2., 4., 6.), 1.=last(5., 1.)
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means empty sequence.
The first sequence contains 2 subsequence with length info [1, 2];
The last sequence contains 3 subsequence with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
It will apply pooling on last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is a LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
out.shape[0] == len(x.lod[-1]) == 5
out.data = [[1.], [2.], [4.], [0.0], [1.]]
where 1.=last(1.), 2.=last(3., 2.), 4.=last(4.), 0.0 = pad_value, 1=last(6., 5., 1.)
Args:
input(Variable): LoDTensor with lod_level no more than 2. The data type should be float32.
Returns:
Variable: LoDTensor consisting of the sequence's last step vector. The data type is float32.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_last_step = fluid.layers.sequence_last_step(input=x)
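# A runnable sketch of Case 1 above (illustrative only; `demo_prog` and `seq`
# are hypothetical names, not part of the API), built in a separate Program
# so it does not interfere with the variable declared above.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    seq = fluid.data(name='seq', shape=[None, 1], dtype='float32', lod_level=1)
    last = fluid.layers.sequence_last_step(input=seq)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
seq_lod = fluid.create_lod_tensor(
    np.array([[1.], [3.], [2.], [4.], [6.], [5.], [1.]], dtype='float32'),
    [[2, 3, 2]], place)
res = exe.run(demo_prog, feed={'seq': seq_lod}, fetch_list=[last])
print(res[0])  # expected [[3.], [6.], [1.]], matching Case 1 above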
"""
return sequence_pool(input=input, pool_type="last")
def sequence_slice(input, offset, length, name=None):
"""
**Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start
offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text
- Case:
Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]],
input.dims = (5, 2),
with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]],
out.dims = (3, 2).
Note:
The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
input(Variable): LoDTensor, The input Variable which consists of the complete
    sequences. The data type is float32 or float64.
offset(Variable): LoDTensor, The offset to slice each sequence. The data
    type is int32 or int64.
length(Variable): LoDTensor, The length of each subsequence. The data
    type is int32 or int64.
name(str|None): The default value is None. Normally there is no need
for user to set this property. For more information,
please refer to :ref:`api_guide_Name`
Returns:
Variable: The output subsequences.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
seqs = fluid.data(name='x', shape=[10, 5],
dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
offset.stop_gradient = True
length.stop_gradient = True
helper.append_op(
type="sequence_slice",
inputs={"X": input,
"Offset": offset,
"Length": length},
outputs={"Out": out})
return out
@templatedoc()
def pool2d(input,
pool_size=-1,
pool_type="max",
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
name=None,
exclusive=True,
data_format="NCHW"):
"""
${comment}
Args:
input (Variable): The input tensor of pooling operator which is a 4-D tensor with
shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
`"NHWC"`, where `N` is batch size, `C` is the number of channels,
`H` is the height of the feature, and `W` is the width of the
feature. The data type is float32 or float64.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
Variable: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: If `pool_type` is not "max" nor "avg".
ValueError: If `global_pooling` is False and `pool_size` is -1.
TypeError: If `use_cudnn` is not a bool value.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If `pool_padding` is a string, but not "SAME" or "VALID".
ValueError: If `pool_padding` is "VALID", but `ceil_mode` is True.
ValueError: If `pool_padding` is a list or tuple, but the elements in the batch or channel dimensions are non-zero.
ShapeError: If the input is not a 4-D or 5-D Tensor.
ShapeError: If the dimension of input minus the size of `pool_stride` is not 2.
ShapeError: If the size of `pool_size` and `pool_stride` is not equal.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
# max pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "max",
pool_stride = 1,
global_pooling=False)
# average pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "avg",
pool_stride = 1,
global_pooling=False)
# global average pool2d
pool2d = fluid.layers.pool2d(
input = data,
pool_size = 2,
pool_type = "avg",
pool_stride = 1,
global_pooling=True)
# Attr(pool_padding) is a list with 4 elements, Attr(data_format) is "NCHW".
out_1 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = [1, 2, 1, 0],
data_format = "NCHW")
# Attr(pool_padding) is a string, Attr(data_format) is "NCHW".
out_2 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = "VALID",
data_format = "NCHW")
"""
if pool_type not in ["max", "avg"]:
raise ValueError(
"Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.",
str(pool_type))
if global_pooling is False and pool_size == -1:
raise ValueError(
"When Attr(global_pooling) is False, Attr(pool_size) must be passed "
"and be a valid value. Received pool_size: %s." % str(pool_size))
if not isinstance(use_cudnn, bool):
raise TypeError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): %s." % str(use_cudnn))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
def update_padding(padding, data_format):
def is_list_or_tuple(ele):
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
if is_list_or_tuple(padding) and len(padding) == 4:
if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
......@@ -5383,868 +3966,93 @@ def conv3d_transpose(input,
raise ValueError(
"Non-zero padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:4]
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 6, 'padding')
elif is_list_or_tuple(padding) and len(padding) == 6:
padding = utils.convert_to_list(padding, 6, 'padding')
else:
padding = utils.convert_to_list(padding, 3, 'padding')
padding = [
padding[0], padding[0], padding[1], padding[1], padding[2],
padding[2]
]
return padding
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." %
str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0, 0, 0, 0, 0]
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0, 0, 0, 0, 0]
padding = _update_padding(padding, data_format)
if filter_size is None:
if output_size is None:
raise ValueError("output_size must be set when filter_size is None")
if isinstance(output_size, int):
output_size = [output_size, output_size]
d_in = input.shape[2] if data_format == 'NCDHW' else input.shape[1]
h_in = input.shape[3] if data_format == 'NCDHW' else input.shape[2]
w_in = input.shape[4] if data_format == 'NCDHW' else input.shape[3]
filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + padding[0] +
padding[1] - 1) // dilation[0] + 1
filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + padding[2] +
padding[3] - 1) // dilation[1] + 1
filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + padding[4] +
padding[5] - 1) // dilation[2] + 1
filter_size = [filter_size_d, filter_size_h, filter_size_w]
else:
filter_size = utils.convert_to_list(filter_size, 3,
'conv3d_transpose.filter_size')
if len(padding) == 6 and utils._is_symmetric_padding(padding, 3):
padding = [padding[0], padding[2], padding[4]]
groups = 1 if groups is None else groups
filter_shape = [input_channel, num_filters // groups] + filter_size
img_filter = helper.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
if data_format == 'NCDHW':
data_format = 'NCHW'
if data_format == 'NDHWC':
data_format = 'NHWC'
pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type=l_type,
inputs={'Input': [input],
'Filter': [img_filter]},
outputs={'Output': pre_bias},
attrs={
'strides': stride,
'paddings': padding,
'padding_algorithm': padding_algorithm,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'data_format': data_format
})
if data_format == 'NCHW':
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
else:
pre_act = helper.append_bias_op(pre_bias, dim_start=4, dim_end=5)
out = helper.append_activation(pre_act)
return out
def sequence_expand(x, y, ref_level=-1, name=None):
"""Sequence Expand Layer. This layer will expand the input variable ``x`` \
according to specified level ``ref_level`` lod of ``y``. Please note that \
the lod level of ``x`` is at most 1. If the lod level of ``x`` is 1, then \
the size of lod of ``x`` must be equal to the length of ``ref_level`` lod \
of ``y``. If the lod level of ``x`` is 0, then the first dim of ``x`` should \
be equal to the size of ``ref_level`` of ``y``. The rank of **x** is at least 2. \
When rank of ``x`` is greater than 2, then it would be viewed as a 2-D tensor.
Please note that the input ``x`` should be LodTensor or Tensor, \
and input ``y`` must be LodTensor.
Following examples will explain how sequence_expand works:
.. code-block:: text
Case 1
Consider 2 sequences [a][b] and [c][d], now we want to expand them to [a][b], [a][b], [c][d] and [c][d].
Sequence [a][b] expand twice and [c][d] expands twice, so the lod which according to is [2, 2].
Input x is a 1-level LoDTensor:
x.lod = [[2, 2]] #lod based on length may be easier to understand
x.data = [[a], [b], [c], [d]]
x.dims = [4, 1]
input y is a LoDTensor:
y.lod = [[2, 2], #the 0th level lod, according to this level
[3, 3, 1, 1]] #the 1st level lod, it has nothing to do with this level
ref_level: 0
then output is a 1-level LoDTensor out:
out.lod = [[2, 2, 2, 2]] #lod based on length
out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
out.dims = [8, 1]
Case 2
Consider 3 sequences [a], [b], [c], now we want to expand them to [a][a], [c][c][c].
It's obvious that the lod info of expanded sequences is [2, 0, 3].
x is a Tensor:
x.data = [[a], [b], [c]]
x.dims = [3, 1]
y is a LoDTensor:
y.lod = [[2, 0, 3]]
ref_level: -1
then output is a 1-level LodTensor:
out.data = [[a], [a], [c], [c], [c]]
out.dims = [5, 1]
Args:
x (Variable): The input variable which is a Tensor or LoDTensor, with the \
dims ``[M, K]``. The lod level is at most 1. The data type should be \
float32, float64, int8, int32 or int64.
y (Variable): The input variable which is a LoDTensor, the lod level is \
at least 1.
ref_level (int): Lod level of ``y`` to be referred by ``x``. If set to -1, \
refer the last level of lod.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The expanded variable which is a LoDTensor, with dims ``[N, K]``. \
``N`` depends on the lod info of ``x`` and ``y``. \
The data type is same as input.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
x = fluid.data(name='x', shape=[4, 1], dtype='float32')
y = fluid.data(name='y', shape=[8, 1],
dtype='float32', lod_level=1)
out = layers.sequence_expand(x=x, y=y, ref_level=0)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
np_data = np.array([[1], [2], [3], [4]]).astype('float32')
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2]], place)
print(x_lod_tensor)
#lod: [[0, 2, 4]]
# dim: 4, 1
# layout: NCHW
# dtype: float
# data: [1 2 3 4]
np_data = np.array([[1], [2], [3], [4], [5], [6], [7], [8]]).astype('float32')
y_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2], [3,3,1,1]], place)
print(y_lod_tensor)
#lod: [[0, 2, 4][0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: int64_t
# data: [0 0 1 1 1 1 1 0]
out_main = exe.run(fluid.default_main_program(),
feed={'x': x_lod_tensor, 'y': y_lod_tensor},
fetch_list=[out], return_numpy=False)
print(out_main[0])
#lod: [[0, 2, 4, 6, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: float
# data: [1 2 1 2 3 4 3 4]
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='sequence_expand',
inputs={'X': x,
'Y': y},
outputs={'Out': tmp},
attrs={'ref_level': ref_level})
return tmp
def sequence_expand_as(x, y, name=None):
"""Sequence Expand As Layer. This OP will expand the input variable ``x`` \
according to the zeroth level lod of ``y``. Current implementation requires \
the level number of ``y``'s lod must be 1, and the first dimension of \
``x`` should be equal to the size of ``y``'s zeroth level lod, thus \
the expanded LodTensor has the same lod info as ``y``. The expanded result \
has nothing to do with ``x``'s lod, so the lod of Input(X) is not considered.
Please note that the input ``x`` should be LodTensor or Tensor, \
and input ``y`` must be LodTensor.
Following examples will explain how sequence_expand_as works:
.. code-block:: text
Case 1:
Consider 4 sequences [a], [b], [c], [d], now we want to expand them to [a][a][a], [b][b][b], [c] and [d].
It's obvious that the lod info of expanded sequences is [0, 3, 6, 7, 8].
Given a 1-level LodTensor ``x``:
x.data = [[a], [b], [c], [d]]
x.dims = [4, 1]
and input ``y``
y.lod = [[3, 3, 1, 1]] #lod based on length may be easier to understand
then we get 1-level LoDTensor out:
Out.lod = [[0, 3, 6, 7, 8]] #based on offset
Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
Out.dims = [8, 1]
Case 2:
Given a common Tensor ``x``:
x.data = [[a, b], [c, d], [e, f]]
x.dims = [3, 2]
and input ``y``:
y.lod = [[0, 2, 3, 6]]
then we get a 1-level LoDTensor:
out.lod = [[0, 2, 3, 6]]
out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
out.dims = [6, 2]
Args:
x (Variable): The input variable which is a Tensor or LoDTensor, with the \
dims ``[M, K]``. The data type should be float32, float64, int8, int32 \
or int64.
y (Variable): The input variable which is a LoDTensor with 1-level lod.
name (str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The expanded variable which is a LoDTensor with the dims ``[N, K]``. \
``N`` depends on the lod of ``y``, and the lod level must be 1. \
The data type is same as input.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
x = fluid.data(name='x', shape=[4, 1], dtype='float32')
y = fluid.data(name='y', shape=[8, 1], dtype='float32', lod_level=1)
out = layers.sequence_expand_as(x=x, y=y)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
np_data = np.array([[1], [2], [3], [4]]).astype('float32')
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2]], place)
print(x_lod_tensor)
#lod: [[0, 2, 4]]
# dim: 4, 1
# layout: NCHW
# dtype: float
# data: [1 2 3 4]
np_data = np.array([[1], [2], [3], [4], [5], [6], [7], [8]]).astype('float32')
y_lod_tensor = fluid.create_lod_tensor(np_data, [[3,3,1,1]], place)
print(y_lod_tensor)
#lod: [[0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: int64_t
# data: [0 0 1 0 1 1 1 0]
out_main = exe.run(fluid.default_main_program(),
feed={'x': x_lod_tensor, 'y': y_lod_tensor},
fetch_list=[out], return_numpy=False)
print(out_main[0])
#lod: [[0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: float
# data: [1 1 1 2 2 2 3 4]
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='sequence_expand_as',
inputs={'X': x,
'Y': y},
outputs={'Out': tmp})
return tmp
def sequence_pad(x, pad_value, maxlen=None, name=None):
"""
This layer pads the sequences in the same batch to a common length (according \
to ``maxlen``). The padding value is defined by ``pad_value``, and will be \
appended to the tail of sequences. The result is a Python tuple ``(Out, Length)``: \
the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is \
the length information of input sequences. For removing padding data (unpadding \
operation), See :ref:`api_fluid_layers_sequence_unpad` .
Please note that the input ``x`` should be LodTensor.
.. code-block:: text
Case 1:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a],[b],[c],[d],[e]]
pad_value:
pad_value.data = [0]
maxlen = 4
the output tuple (Out, Length):
Out.data = [[[a],[b],[0],[0]],[[c],[d],[e],[0]]]
Length.data = [2, 3] #Original sequences length
Case 2:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
pad_value:
pad_value.data = [0]
default maxlen = None, (the virtual value is 3, according to the shape of x)
the output tuple (Out, Length):
Out.data = [[[a1,a2],[b1,b2],[0,0]],[[c1,c2],[d1,d2],[e1,e2]]]
Length.data = [2, 3]
Case 3:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
pad_value:
pad_value.data = [p1,p2]
default maxlen = None, (the virtual value is 3)
get tuple (Out, Length):
Out.data = [[[a1,a2],[b1,b2],[p1,p2]],[[c1,c2],[d1,d2],[e1,e2]]]
Length.data = [2, 3]
Args:
x (Variable): Input 1-level LodTensor with dims ``[M, K]``. The batch \
size is described by the lod info (the number of sequences). \
The data type should be float32, float64, int8, int32 or int64.
pad_value (Variable): Padding value. It can be a scalar or a 1D tensor \
with length ``K``. If it's a scalar, it will be automatically broadcasted \
to a Tensor. The data type should be as same as ``x``.
maxlen (int, optional): The length of padded sequences, None by default. \
When it is None, all sequences will be padded up to the length of the \
longest one among them; when it is a certain positive value, it must be \
greater than the length of the longest original sequence.
name (str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: A Python tuple (Out, Length): the 1st is a 0 level LodTensor \
``Out``, with the shape ``[batch_size, maxlen, K]``; the second is the original \
sequences' length info ``Length``, which should be a 0-level 1D LodTensor. \
The size of ``Length`` is equal to batch size, and the data type is int64.
Return Type: tuple
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(
input=numpy.array([0.0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
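# A runnable sketch of Case 1 above (illustrative only; `demo_prog` and the
# other names below are hypothetical), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    seq = fluid.data(name='seq', shape=[None, 1], dtype='float32', lod_level=1)
    pad_v = fluid.layers.assign(input=np.array([0.0], dtype=np.float32))
    padded, seq_len = fluid.layers.sequence_pad(x=seq, pad_value=pad_v, maxlen=4)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
seq_lod = fluid.create_lod_tensor(
    np.array([[1.], [2.], [3.], [4.], [5.]], dtype='float32'), [[2, 3]], place)
out_v, len_v = exe.run(demo_prog, feed={'seq': seq_lod},
                       fetch_list=[padded, seq_len])
print(out_v.shape)  # expected (2, 4, 1): two sequences padded to length 4
print(len_v)        # expected [2 3]: the original sequence lengths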
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
length = helper.create_variable_for_type_inference(dtype)
pad_value.stop_gradient = True
length.stop_gradient = True
if maxlen is None:
maxlen = -1
helper.append_op(
type='sequence_pad',
inputs={'X': x,
'PadValue': pad_value},
outputs={'Out': out,
'Length': length},
attrs={'padded_length': maxlen})
return out, length
def sequence_unpad(x, length, name=None):
"""
**Note**:
**The input of the OP is Tensor and the output is LoDTensor. For padding operation, See:** :ref:`api_fluid_layers_sequence_pad`
The OP removes the padding data from the input based on the length information and returns a LoDTensor.
.. code-block:: text
Case 1:
Given input Variable **x**:
x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the actual length
specified by input Variable **length**:
length.data = [2, 3, 4],
after unpadding, the output Variable will be:
out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
out.lod = [[0, 2, 5, 9]]
Args:
x(Variable): A Tensor which contains padding data, and its shape size can not be less than 2.
Supported data types: float32, float64, int32, int64.
length(Variable): A 1D Tensor that stores the actual length of each sample, whose size
equals the 0th dimension of ``x`` . Supported data types: int64.
name(str|None): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: A LoDTensor whose recursive sequence length is consistent with the information of the length parameter and it has the same data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
# pad data
x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0.0], dtype=numpy.float32))
pad_data, len = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
# unpad data
unpad_data = fluid.layers.sequence_unpad(x=pad_data, length=len)
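# A runnable sketch of Case 1 above (illustrative only; `demo_prog` and the
# other names below are hypothetical), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    padded_x = fluid.data(name='padded_x', shape=[None, 5], dtype='float32')
    seq_len = fluid.data(name='seq_len', shape=[None], dtype='int64')
    unpadded = fluid.layers.sequence_unpad(x=padded_x, length=seq_len)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
x_np = np.arange(1, 16).reshape([3, 5]).astype('float32')
res = exe.run(demo_prog,
              feed={'padded_x': x_np,
                    'seq_len': np.array([2, 3, 4], dtype='int64')},
              fetch_list=[unpadded], return_numpy=False)
print(res[0])
# expected lod: [[0, 2, 5, 9]]
# expected data: [1 2 6 7 8 11 12 13 14]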
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_unpad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
length.stop_gradient = True
helper.append_op(
type='sequence_unpad',
inputs={'X': x,
'Length': length},
outputs={'Out': out})
return out
def beam_search(pre_ids,
pre_scores,
ids,
scores,
beam_size,
end_id,
level=0,
is_accumulated=True,
name=None,
return_parent_idx=False):
"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
**This operator only supports LoDTensor.** It is used after finishing
scores calculation to perform beam search for one time step. Specifically,
after ``ids`` and ``scores`` have been produced, it selects the top-K
( `k` is ``beam_size`` ) candidate word ids of current step from ``ids``
according to the corresponding ``scores``. Additionally, ``pre_ids`` and
``pre_scores`` are the output of `beam_search` at previous step, they
are needed for special use to handle ended candidate translations.
Note that if ``is_accumulated`` is True, the ``scores`` passed in should
be accumulated scores. Otherwise, the ``scores`` are
considered as the probabilities of single step and would be transformed to
the log field and added up with ``pre_scores`` for final scores in this
operator. Length penalty should be done with extra operators before calculating
the accumulated scores if needed.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
pre_ids(Variable): A LodTensor variable (lod level is 2), representing
the selected ids of previous step. It is the output of beam_search
at previous step. Its shape is `[batch_size, 1]` and its lod is
`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step. The data type should be int64.
pre_scores(Variable): A LodTensor variable has the same shape and lod
with ``pre_ids`` , representing the accumulated scores corresponding
to the selected ids of previous step. It is the output of
beam_search at previous step. The data type should be float32.
ids(Variable|None): A LodTensor variable containing the candidates ids.
It has the same lod with ``pre_ids`` and its shape should be
`[batch_size * beam_size, K]`, where `K` is supposed to be greater than
``beam_size`` and the first dimension size (which decreases as samples reach
the end) should be the same as that of ``pre_ids`` . The data type
should be int64. It can be None, in which case the indices in ``scores``
are used as ids.
scores(Variable): A LodTensor variable containing the accumulated
scores corresponding to ``ids`` . Both its shape and lod are same as
those of ``ids`` . The data type should be float32.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
level(int): **It can be ignored and mustn't change currently.**
The 2 level lod used in this operator has the following
meaning: The first level describes how many beams each sample has,
which would change to 0 when beams of the sample all end (batch reduce);
The second level describes how many times each beam is selected.
Default 0, which shouldn't be changed currently.
is_accumulated(bool): Whether the input ``score`` is accumulated scores.
Default True.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
return_parent_idx(bool, optional): Whether to return an extra Tensor variable
in output, which stores the selected ids' parent indices in
``pre_ids`` and can be used to update RNN's states by gather operator.
Default False.
Returns:
tuple: The tuple contains two or three LodTensor variables. The two LodTensor, \
representing the selected ids and the corresponding accumulated scores of \
current step, have the same shape `[batch_size, beam_size]` and lod with 2 levels, \
and have data types int64 and float32. If ``return_parent_idx`` is True, \
an extra Tensor variable preserving the selected ids' parent indices \
is included, whose shape is `[batch_size * beam_size]` and data type \
is int64.
Examples:
.. code-block:: python
import paddle.fluid as fluid
# Suppose `probs` contains predicted results from the computation
# cell and `pre_ids` and `pre_scores` is the output of beam_search
# at previous step.
beam_size = 4
end_id = 1
pre_ids = fluid.data(
name='pre_id', shape=[None, 1], lod_level=2, dtype='int64')
pre_scores = fluid.data(
name='pre_scores', shape=[None, 1], lod_level=2, dtype='float32')
probs = fluid.data(
name='probs', shape=[None, 10000], dtype='float32')
topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(x=topk_scores),
y=fluid.layers.reshape(pre_scores, shape=[-1]),
axis=0)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=end_id)
"""
helper = LayerHelper('beam_search', **locals())
score_type = pre_scores.dtype
id_type = pre_ids.dtype
inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
if ids is not None:
inputs["ids"] = ids
selected_scores = helper.create_variable_for_type_inference(
dtype=score_type)
selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
# parent_idx is a tensor used to gather cell states at the next time
# step. Though lod in selected_ids can also be used to gather by
# sequence_expand, it is not efficient.
# gather_op's index input only supports int32 dtype currently
parent_idx = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='beam_search',
inputs=inputs,
outputs={
'selected_ids': selected_ids,
'selected_scores': selected_scores,
'parent_idx': parent_idx
},
attrs={
# TODO(ChunweiYan) to assure other value support
'level': level,
'beam_size': beam_size,
'end_id': end_id,
'is_accumulated': is_accumulated,
})
if return_parent_idx:
return selected_ids, selected_scores, parent_idx
else:
return selected_ids, selected_scores
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
This operator is used after beam search has completed. It constructs the
full predicted sequences for each sample by walking back along the search
paths stored in lod of ``ids`` . The result sequences are stored in a
LoDTensor, which can be parsed as follows:
.. code-block:: text
If lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
The first level of lod stands for: There are 2 samples each having 3
(beam width) predicted sequences.
The second level of lod stands for: The lengths of the first sample's
3 predicted sequences are 12, 12, 16; The lengths of the second sample's
3 predicted sequences are 14, 13, 15.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
ids(Variable): The LoDTensorArray variable containing the selected ids
of all steps. Each LoDTensor in it has int64 data type and 2 level
lod which can be used to get the search paths.
scores(Variable): The LodTensorArray variable containing the accumulated
scores corresponding to selected ids of all steps. It has the same size
as ``ids`` . Each LoDTensor in it has the same shape and lod as the
counterpart in ``ids`` , and has a float32 data type.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
tuple: The tuple contains two LodTensor variables. The two LodTensor, \
containing the full sequences of ids and the corresponding accumulated \
scores, have the same shape flattened to 1D and have the same 2 level \
lod. The lod can be used to get how many predicted sequences each sample \
has and how many ids each predicted sequence has.
Examples:
.. code-block:: python
import paddle.fluid as fluid
# Suppose `ids` and `scores` are LodTensorArray variables reserving
# the selected ids and scores of all steps
ids = fluid.layers.create_array(dtype='int64')
scores = fluid.layers.create_array(dtype='float32')
finished_ids, finished_scores = fluid.layers.beam_search_decode(
ids, scores, beam_size=5, end_id=0)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
},
attrs={"beam_size": beam_size,
"end_id": end_id})
return sentence_ids, sentence_scores
def lstm_unit(x_t,
hidden_t_prev,
cell_t_prev,
forget_bias=0.0,
param_attr=None,
bias_attr=None,
name=None):
"""
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
one time step, whose implementation is based on calculations described in `RECURRENT
NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_ .
We add forget_bias to the biases of the forget gate in order to
reduce the scale of forgetting. The formula is as follows:
.. math::
i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} tanh (c_{t})
:math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step;
:math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` ,
representing the output from the previous time step.
:math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are the input gate, forget gate, cell, output gate
and hidden calculation, respectively.
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." %
str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0, 0, 0, 0, 0]
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0, 0, 0, 0, 0]
Args:
x_t(Variable): A 2D Tensor representing the input of current time step.
Its shape should be :math:`[N, M]` , where :math:`N` stands for batch
size, :math:`M` for the feature size of input. The data type should
be float32 or float64.
hidden_t_prev(Variable): A 2D Tensor representing the hidden value from
previous step. Its shape should be :math:`[N, D]` , where :math:`N`
stands for batch size, :math:`D` for the hidden size. The data type
should be same as ``x_t`` .
cell_t_prev(Variable): A 2D Tensor representing the cell value from
previous step. It has the same shape and data type with ``hidden_t_prev`` .
forget_bias (float, optional): :math:`forget\\_bias` added to the biases
of the forget gate. Default 0.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
tuple: The tuple contains two Tensor variables with the same shape and \
data type with ``hidden_t_prev`` , representing the hidden value and \
cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \
the formula.
Raises:
ValueError: Rank of x_t must be 2.
ValueError: Rank of hidden_t_prev must be 2.
ValueError: Rank of cell_t_prev must be 2.
ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same.
ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim, hidden_dim = 128, 64, 512
data = fluid.data(name='step_data', shape=[None], dtype='int64')
x = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
pre_cell = fluid.data(
name='pre_cell', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.lstm_unit(
x_t=x,
hidden_t_prev=pre_hidden,
cell_t_prev=pre_cell)
"""
helper = LayerHelper('lstm_unit', **locals())
if len(x_t.shape) != 2:
raise ValueError("Rank of x_t must be 2.")
if len(hidden_t_prev.shape) != 2:
raise ValueError("Rank of hidden_t_prev must be 2.")
if len(cell_t_prev.shape) != 2:
raise ValueError("Rank of cell_t_prev must be 2.")
if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
0] != cell_t_prev.shape[0]:
raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
"cell_t_prev must be the same.")
if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
raise ValueError("The 2nd dimensions of hidden_t_prev and "
"cell_t_prev must be the same.")
if bias_attr is None:
bias_attr = ParamAttr()
size = cell_t_prev.shape[1]
concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
fc_out = fc(input=concat_out,
size=4 * size,
param_attr=param_attr,
bias_attr=bias_attr)
dtype = x_t.dtype
c = helper.create_variable_for_type_inference(dtype)
h = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='lstm_unit',
inputs={"X": fc_out,
"C_prev": cell_t_prev},
outputs={"C": c,
"H": h},
attrs={"forget_bias": forget_bias})
return h, c
def reduce_sum(input, dim=None, keep_dim=False, name=None):
......@@ -7521,63 +5329,6 @@ def warpctc(input,
return loss_out
def sequence_reshape(input, new_dim):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use reshape Op.(fluid.layers.** :ref:`api_fluid_layers_reshape` ).
This operator only supports LoDTensor as input. Given :attr:`new_dim` ,
it will compute new shape according to original length of each sequence,
original dimensions and :attr:`new_dim` . Then it will output a new LoDTensor
containing :attr:`new_dim` . Currently it only supports 1-level LoDTensor.
Please make sure that (original length * original dimensions) can be divided
by the :attr:`new_dim` with no remainder for each sequence.
.. code-block:: text
input is a LoDTensor:
input.lod = [[0, 2, 6]]
input.data = [[1, 2], [3, 4],
[5, 6], [7, 8],
[9, 10], [11, 12]]
input.shape = [6, 2]
set new_dim = 4
out is a LoDTensor:
out.lod = [[0, 1, 3]]
out.data = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]
out.shape = [3, 4]
Args:
input (Variable): 1-level LoDTensor with shape :math:`[M, K]` . The data type should
be int32, int64, float32 or float64.
new_dim (int): New dimension that the input LoDTensor is reshaped to.
Returns:
Variable: Reshaped LoDTensor according to new dimension. The data type is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 16], dtype='float32', lod_level=1)
x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=4)
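# A runnable sketch of the text case above (illustrative only; `demo_prog`
# and `seq` are hypothetical names), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    seq = fluid.data(name='seq', shape=[None, 2], dtype='float32', lod_level=1)
    reshaped = fluid.layers.sequence_reshape(input=seq, new_dim=4)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
seq_lod = fluid.create_lod_tensor(
    np.arange(1, 13).reshape([6, 2]).astype('float32'), [[2, 4]], place)
res = exe.run(demo_prog, feed={'seq': seq_lod}, fetch_list=[reshaped],
              return_numpy=False)
print(res[0])
# expected shape [3, 4] and lod [[0, 1, 3]], as in the text case above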
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_reshape', **locals())
out = helper.create_variable_for_type_inference(helper.input_dtype())
helper.append_op(
type='sequence_reshape',
inputs={'X': [input]},
outputs={'Out': [out]},
attrs={'new_dim': new_dim})
return out
# FIXME(wuyi): let docstring_checker.py understand @autodoc.
# For now, the comments in c++ use types like Tensor, but in python side
# the type is often "Variable", and arguments may vary.
......@@ -11074,84 +8825,6 @@ def scatter_nd(index, updates, shape, name=None):
return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name)
def sequence_scatter(input, index, updates, name=None):
"""
**Note**:
**The index and updates parameters of the OP must be LoDTensor.**
Adds the updates data to the corresponding input according to the index.
The update rule is as follows: output[instance_index][index[pos]] = input[instance_index][index[pos]] + updates[pos],
where instance_index is the index in the batch of the sample that pos corresponds to.
The value of output[i][j] depends on whether j can be found in the (i+1)-th interval of the index. If found,
out[i][j] = input[i][j] + updates[m][n], otherwise, out[i][j] = input[i][j].
For example, in the following example, the lod information for index is divided into three sequences. Among
them, because the element 0 can be found in the first interval of the index, it is updated with the value of
the corresponding position of the updates, out[0][0] = input[0][0]+updates[0][0] . Because element 1 cannot
be found in the third interval of index, out[2][1] = input[2][1].
.. code-block:: text
*Case 1:
Given:
input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
input.dims = [3, 6]
index.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]]
index.lod = [[0, 3, 8, 12]]
updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]]
updates.lod = [[ 0, 3, 8, 12]]
Then:
out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
[1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
out.dims = X.dims = [3, 6]
Args:
input (Variable): A Tensor with shape of :math:`[N, k_1... k_n]`. Supported data types: float32, float64, int32, int64.
index (Variable): A LoDTensor contains index information. Its LoD level must be 1 and its data type must be int64.
updates (Variable): A LodTensor contains updates information. It has the same LoD level with the index and has the
same data type with the input. Supported data types: float32, float64, int32, int64.
name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information,
please refer to :ref:`api_guide_Name`
Returns:
Variable: A Tensor which has been updated. It has the same shape and data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
input = fluid.data( name="x", shape=[None, 3, 6], dtype='float32' )
index = fluid.data( name='index', shape=[12, 1], dtype='int64', lod_level=1)
updates = fluid.data( name='updates', shape=[12, 1], dtype='float32', lod_level=1)
output = fluid.layers.sequence_scatter(input, index, updates)
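# A runnable sketch of the text case above (illustrative only; `demo_prog`
# and the other names below are hypothetical), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    base = fluid.data(name='base', shape=[3, 6], dtype='float32')
    idx = fluid.data(name='idx', shape=[None, 1], dtype='int64', lod_level=1)
    upd = fluid.data(name='upd', shape=[None, 1], dtype='float32', lod_level=1)
    scattered = fluid.layers.sequence_scatter(base, idx, upd)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
idx_lod = fluid.create_lod_tensor(
    np.array([[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]],
             dtype='int64'), [[3, 5, 4]], place)
upd_lod = fluid.create_lod_tensor(
    np.array([[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0],
              [0.2], [0.3], [0.1], [0.4]], dtype='float32'), [[3, 5, 4]], place)
res = exe.run(demo_prog,
              feed={'base': np.ones([3, 6], dtype='float32'),
                    'idx': idx_lod, 'upd': upd_lod},
              fetch_list=[scattered])
print(res[0])  # expected to match out.data in the text case above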
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_scatter', **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="sequence_scatter",
inputs={"X": input,
"Ids": index,
"Updates": updates},
outputs={"Out": out})
return out
@templatedoc()
def random_crop(x, shape, seed=None):
"""
......@@ -12582,146 +10255,6 @@ def flatten(x, axis=1, name=None):
return out
def sequence_enumerate(input, win_size, pad_value=0, name=None):
"""
Generate a new sequence for the input index sequence with \
shape ``[d_1, win_size]``, which enumerates all the \
sub-sequences with length ``win_size`` of the input with \
shape ``[d_1, 1]``, and padded by ``pad_value`` if necessary in generation.
Please note that the `input` must be LodTensor.
.. code-block:: text
Input x:
x.lod = [[0, 3, 5]]
x.data = [[1], [2], [3], [4], [5]]
x.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
out.lod = [[0, 3, 5]]
out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
out.dims = [5, 2]
Args:
input (Variable): The input variable which is a index sequence, \
which should be a LodTensor with shape ``[d_1, 1]`` and 1-level lod info. \
The data type should be float32, float64, int8, int32 or int64.
win_size (int): The window size for enumerating all sub-sequences.
pad_value (int, optional): The padding value, default 0.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The enumerate sequence variable which is a LoDTensor with \
shape ``[d_1, win_size]`` and 1-level lod info. \
The data type is same as ``input``.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
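# A runnable sketch of the text case above (illustrative only; `demo_prog`
# and `ids` are hypothetical names), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    ids = fluid.data(name='ids', shape=[None, 1], dtype='int64', lod_level=1)
    enum = fluid.layers.sequence_enumerate(input=ids, win_size=2, pad_value=0)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
ids_lod = fluid.create_lod_tensor(
    np.array([[1], [2], [3], [4], [5]], dtype='int64'), [[3, 2]], place)
res = exe.run(demo_prog, feed={'ids': ids_lod}, fetch_list=[enum],
              return_numpy=False)
print(res[0])
# expected data: [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]], lod [[0, 3, 5]]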
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
helper.append_op(
type='sequence_enumerate',
inputs={'X': input},
outputs={'Out': out},
attrs={'win_size': win_size,
'pad_value': pad_value})
return out
def sequence_mask(x, maxlen=None, dtype='int64', name=None):
"""
**SequenceMask Layer**
This layer outputs a mask according to the input :code:`x` and
:code:`maxlen` with data type of :code:`dtype`.
Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
:code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
.. math::
y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
.. code-block:: text
Case:
Consider input:
x = [3, 1, 1, 0] max_len = 4
then we get out:
mask = [[1, 1, 1, 0],
[1, 0, 0, 0],
[1, 0, 0, 0],
[0, 0, 0, 0]]
Args:
x (Variable): Input tensor of sequence_mask layer, \
whose elements are integers less than :code:`maxlen`. \
Tensor or LodTensor with shape [d_1, d_2, ..., d_n].
maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \
is None, it would be replaced with :math:`max(x)`. \
dtype (np.dtype|core.VarDesc.VarType|str, optional): Data type of the output, \
``int64`` by default.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The output sequence mask. Tensor or LodTensor with shape [d_1, d_2, ..., d_n, maxlen] \
and data type of :code:`dtype`. The data type should be float32, float64, int8, \
int32 or int64.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
x = fluid.data(name='x', shape=[10], dtype='float32', lod_level=1)
mask = layers.sequence_mask(x=x)
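# A runnable sketch of the text case above (illustrative only; `demo_prog`
# and `lens` are hypothetical names). Here the lengths are fed as a plain
# int64 Tensor and maxlen is fixed to 4 to match the case.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    lens = fluid.data(name='lens', shape=[None], dtype='int64')
    len_mask = fluid.layers.sequence_mask(x=lens, maxlen=4, dtype='int64')
exe = fluid.Executor(fluid.CPUPlace())
res = exe.run(demo_prog,
              feed={'lens': np.array([3, 1, 1, 0], dtype='int64')},
              fetch_list=[len_mask])
print(res[0])
# expected:
# [[1 1 1 0]
#  [1 0 0 0]
#  [1 0 0 0]
#  [0 0 0 0]]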
"""
helper = LayerHelper('sequence_mask', **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=dtype)
else:
out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
inputs = {'X': [x]}
attrs = {'out_dtype': out.dtype}
if maxlen is not None:
if isinstance(maxlen, Variable):
inputs['MaxLenTensor'] = maxlen
else:
attrs['maxlen'] = maxlen
helper.append_op(
type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs)
out.stop_gradient = True
return out
def stack(x, axis=0):
"""
......@@ -15306,68 +12839,6 @@ def space_to_depth(x, blocksize, name=None):
return out
@templatedoc()
def sequence_reverse(x, name=None):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use reverse Op.(fluid.layers.** :ref:`api_fluid_layers_reverse` ).
This operator only supports LoDTensor as input. It will reverse each sequence for input LoDTensor.
Currently it only supports 1-level LoDTensor. This operator is very useful when building a
reverse :ref:`api_fluid_layers_DynamicRNN` network.
.. code-block:: text
input(x) is a LoDTensor:
x.lod = [[0, 2, 5]]
x.data = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12],
[13,14, 15, 16],
[17,18, 19, 20]]
x.shape = [5, 4]
output LoDTensor with same shape and LoD info:
out.lod = [[0, 2, 5]]
out.data = [[5, 6, 7, 8],
[1, 2, 3, 4],
[17,18, 19, 20],
[13,14, 15, 16],
[9, 10, 11, 12]]
out.shape = [5, 4]
Args:
x(Variable): LoDTensor with 1-level LoD info. Currently it only supports 1-level LoDTensor.
The data type should be float32, float64, int8, int32 or int64.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: LoDTensor reversed from input. The data type is same with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_reversed = fluid.layers.sequence_reverse(x)
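# A runnable sketch of the text case above (illustrative only; `demo_prog`
# and `seq` are hypothetical names), built in a separate Program.
import numpy as np
demo_prog = fluid.Program()
with fluid.program_guard(demo_prog, fluid.Program()):
    seq = fluid.data(name='seq', shape=[None, 4], dtype='float32', lod_level=1)
    rev = fluid.layers.sequence_reverse(seq)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
seq_lod = fluid.create_lod_tensor(
    np.arange(1, 21).reshape([5, 4]).astype('float32'), [[2, 3]], place)
res = exe.run(demo_prog, feed={'seq': seq_lod}, fetch_list=[rev],
              return_numpy=False)
print(res[0])
# expected: rows [5..8], [1..4] for the first sequence and rows [17..20],
# [13..16], [9..12] for the second, with lod [[0, 2, 5]] unchanged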
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_reverse", **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="sequence_reverse",
inputs={"X": x},
outputs={"Y": out},
attrs=dict())
return out
def affine_channel(x,
scale=None,
bias=None,
......
......@@ -20,7 +20,11 @@ from . import nn
from . import tensor
from . import control_flow
from . import utils
from . import sequence_lod
from .utils import *
from ..layer_helper import LayerHelper
from ..framework import in_dygraph_mode
from ..param_attr import ParamAttr
__all__ = [
'RNNCell',
......@@ -30,6 +34,14 @@ __all__ = [
'BeamSearchDecoder',
'rnn',
'dynamic_decode',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'lstm_unit',
'lstm',
'beam_search',
'beam_search_decode',
]
......@@ -433,7 +445,7 @@ def rnn(cell,
if sequence_length:
max_seq_len = nn.shape(flatten(inputs)[0])[0]
mask = nn.sequence_mask(
mask = sequence_lod.sequence_mask(
sequence_length,
maxlen=max_seq_len,
dtype=flatten(initial_states)[0].dtype)
......@@ -1163,3 +1175,1195 @@ def dynamic_decode(decoder,
final_outputs = map_structure(_transpose_batch_time, final_outputs)
return final_outputs, final_states
def dynamic_lstm(input,
size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32',
name=None):
"""
**Note**:
1. This OP only supports LoDTensor as inputs. If you need to deal with Tensor, please use :ref:`api_fluid_layers_lstm` .
2. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
The implementation of this OP include diagonal/peephole connections.
Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you do not need peephole connections, please set use_peepholes to False .
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
.. math::
\widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c})
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot tanh(c_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
size (int): must be 4 * hidden_size.
h_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
Data type is float32 or float64. If set to None, it will be a vector of all 0. Default: None.
c_0( :ref:`api_guide_Variable_en` , optional): The initial cell state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
Data type is float32 or float64. If set to None, it will be a vector of all 0. `h_0` and `c_0` can be None but only at the same time. Default: None.
param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` .
If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.
- Weights = :math:`\{ W_{ch},W_{ih},W_{fh},W_{oh} \}` , the shape is [hidden_size, 4*hidden_size].
bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
Please refer to :ref:`api_fluid_ParamAttr` . Default: None.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is [1, 4*hidden_size].
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is [1, 7*hidden_size].
use_peepholes (bool, optional): Whether to use peephole connection or not. Default: True.
is_reverse (bool, optional): Whether to calculate reverse LSTM. Default: False.
gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default: "sigmoid".
cell_activation (str, optional): The activation for cell output. Default: "tanh".
candidate_activation (str, optional): The activation for candidate hidden state. Default: "tanh".
dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32".
name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
Returns:
tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
The hidden state and cell state of LSTM
- hidden: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
- cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
emb_dim = 256
vocab_size = 10000
hidden_dim = 512
data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
bias_attr=False)
forward, cell = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
forward.shape # (-1, 512)
cell.shape # (-1, 512)
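# A minimal bidirectional sketch (the names below are illustrative): run a second
# dynamic_lstm with is_reverse=True on its own input projection and concatenate
# the two hidden sequences along the feature axis.
backward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
bias_attr=False)
backward, _ = fluid.layers.dynamic_lstm(
input=backward_proj, size=hidden_dim * 4, use_peepholes=False, is_reverse=True)
bi_lstm = fluid.layers.concat(input=[forward, backward], axis=1)
bi_lstm.shape # (-1, 1024)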
"""
assert in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
helper.append_op(
type='lstm',
inputs=inputs,
outputs={
'Hidden': hidden,
'Cell': cell,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation
})
return hidden, cell
def lstm(input,
init_h,
init_c,
max_len,
hidden_size,
num_layers,
dropout_prob=0.0,
is_bidirec=False,
is_test=False,
name=None,
default_initializer=None,
seed=-1):
"""
**Note**:
This OP only supports running on GPU devices.
This OP implements LSTM operation - `Hochreiter, S., & Schmidhuber, J. (1997) <http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf>`_ .
The implementation of this OP does not include diagonal/peephole connections.
Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you need peephole connections, please use :ref:`api_fluid_layers_dynamic_lstm` .
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
.. math::
\widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b{x_c} + b_{h_c})
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot tanh(c_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64
init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len.
hidden_size (int): hidden size of the LSTM.
num_layers (int): total layers number of the LSTM.
dropout_prob(float, optional): dropout probability. Dropout ONLY works between rnn layers, NOT between time steps.
There is NO dropout on the rnn output of the last RNN layer.
Default: 0.0.
is_bidirec (bool, optional): If it is bidirectional. Default: False.
is_test (bool, optional): If it is in test phase. Default: False.
name (str, optional): A name for this layer. If set None, the layer
will be named automatically. Default: None.
default_initializer(Initializer, optional): The initializer used to initialize the weight.
If set None, the default initializer will be used. Default: None.
seed(int, optional): Seed for dropout in LSTM. If it's -1, dropout will use a random seed. Default: -1.
Returns:
tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) :
Three tensors, rnn_out, last_h, last_c:
- rnn_out is the output of the LSTM hidden layer, whose shape is :math:`[seq\_len, batch\_size, hidden\_size]` ; \
if is_bidirec is set to True, the shape will be :math:`[seq\_len, batch\_size, hidden\_size*2]`
- last_h is the hidden state of the last step of LSTM, \
whose shape is :math:`[num\_layers, batch\_size, hidden\_size]` ; \
if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
- last_c is the cell state of the last step of LSTM, \
whose shape is :math:`[num\_layers, batch\_size, hidden\_size]` ; \
if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
emb_dim = 256
vocab_size = 10000
data = fluid.data(name='x', shape=[None, 100], dtype='int64')
emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
batch_size = 20
max_len = 100
dropout_prob = 0.2
input_size = 100
hidden_size = 150
num_layers = 1
init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \
max_len, hidden_size, num_layers, \
dropout_prob=dropout_prob)
rnn_out.shape # (-1, 100, 150)
last_h.shape # (1, 20, 150)
last_c.shape # (1, 20, 150)
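# A minimal bidirectional sketch (illustrative names): with is_bidirec=True the
# initial states double their first dimension and the output doubles its last dimension.
init_h_bi = layers.fill_constant( [num_layers * 2, batch_size, hidden_size], 'float32', 0.0 )
init_c_bi = layers.fill_constant( [num_layers * 2, batch_size, hidden_size], 'float32', 0.0 )
bi_out, bi_last_h, bi_last_c = layers.lstm( emb, init_h_bi, init_c_bi, \
max_len, hidden_size, num_layers, \
dropout_prob=dropout_prob, is_bidirec=True)
bi_out.shape # (-1, 100, 300)
bi_last_h.shape # (2, 20, 150)
bi_last_c.shape # (2, 20, 150)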
"""
helper = LayerHelper('cudnn_lstm', **locals())
dtype = input.dtype
input_shape = list(input.shape)
input_size = input_shape[-1]
weight_size = 0
for i in range(num_layers):
if i == 0:
input_weight_size = (input_size * hidden_size) * 4
else:
if is_bidirec:
input_weight_size = (hidden_size * 2 * hidden_size) * 4
else:
input_weight_size = (hidden_size * hidden_size) * 4
hidden_weight_size = (hidden_size * hidden_size) * 4
if is_bidirec:
weight_size += (input_weight_size + hidden_weight_size) * 2
weight_size += hidden_size * 8 * 2
else:
weight_size += input_weight_size + hidden_weight_size
weight_size += hidden_size * 8
weight = helper.create_parameter(
attr=helper.param_attr,
shape=[weight_size],
dtype=dtype,
default_initializer=default_initializer)
out = helper.create_variable_for_type_inference(dtype)
last_h = helper.create_variable_for_type_inference(dtype)
last_c = helper.create_variable_for_type_inference(dtype)
cache = helper.create_variable(
persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
helper.append_op(
type='cudnn_lstm',
inputs={
'Input': input,
'InitH': init_h,
'InitC': init_c,
'W': weight,
'Cache': cache,
},
outputs={
'Out': out,
'last_h': last_h,
'last_c': last_c,
},
attrs={
'max_len': max_len,
'is_bidirec': is_bidirec,
'input_size': input_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'is_test': is_test,
'dropout_prob': dropout_prob,
'seed': seed,
})
return out, last_h, last_c
def dynamic_lstmp(input,
size,
proj_size,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
proj_activation='tanh',
dtype='float32',
name=None,
h_0=None,
c_0=None,
cell_clip=None,
proj_clip=None):
"""
**Note**:
1. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
This OP implements the LSTMP (LSTM Projected) layer.
The LSTMP layer has a separate linear mapping layer behind the LSTM layer. -- `Sak, H., Senior, A., & Beaufays, F. (2014) <https://ai.google/research/pubs/pub43905.pdf>`_ .
Compared with the standard LSTM layer, LSTMP has an additional linear mapping layer,
which is used to map from the original hidden state :math:`h_t` to the lower dimensional state :math:`r_t` .
This reduces the total number of parameters and computational complexity, especially when the output unit is relatively large.
The default implementation of the OP contains diagonal/peephole connections,
please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
If you need to disable the peephole connections, set use_peepholes to False.
This OP computes each timestep as follows:
.. math::
i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
.. math::
f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
.. math::
o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_{t-1} + b_o)
.. math::
\widetilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
.. math::
c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
.. math::
h_t = o_t \odot act_h(c_t)
.. math::
r_t = \overline{act_h}(W_{rh}h_t)
The symbolic meanings in the formula are as follows:
- :math:`x_{t}` represents the input at timestep :math:`t`
- :math:`h_{t}` represents the hidden state at timestep :math:`t`
- :math:`r_{t}` : represents the state of the projected output of the hidden state :math:`h_{t}`
- :math:`h_{t-1}, c_{t-1}, r_{t-1}` represent the hidden state, cell state and projected output at timestep :math:`t-1` , respectively
- :math:`\widetilde{c_t}` represents the candidate cell state
- :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
- :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
- :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
- :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
- :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension
Parameters:
input( :ref:`api_guide_Variable_en` ): The input of dynamic_lstmp layer, which supports
variable-length input sequences.
It is a multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
size(int): must be 4 * hidden_size.
proj_size(int): The size of projection output.
param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` .
If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.
- Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [P, 4*hidden_size] , where P is the projection size.
- Projection weight = :math:`\{ W_{rh} \}` , the shape is [hidden_size, P].
bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
Please refer to :ref:`api_fluid_ParamAttr` . Default: None.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is [1, 4*hidden_size].
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is [1, 7*hidden_size].
use_peepholes (bool, optional): Whether to use peephole connection or not. Default True.
is_reverse (bool, optional): Whether to calculate reverse LSTM. Default False.
gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default "sigmoid".
cell_activation (str, optional): The activation for cell output. Default "tanh".
candidate_activation (str, optional): The activation for candidate hidden state. Default "tanh".
proj_activation(str, optional): The activation for projection output. Default "tanh".
dtype (str, optional): Data type, can be "float32" or "float64". Default "float32".
name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
h_0( :ref:`api_guide_Variable` , optional): The initial hidden state is an optional input, default is zero.
This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. Default: None.
c_0( :ref:`api_guide_Variable` , optional): The initial cell state is an optional input, default is zero.
This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size.
`h_0` and `c_0` can be None but only at the same time. Default: None.
cell_clip(float, optional): If not None, the cell state is clipped
by this value prior to the cell output activation. Default: None.
proj_clip(float, optional): If `proj_clip` is provided, then the projected values
are clipped elementwise to within `[-proj_clip, proj_clip]`. Default: None.
Returns:
tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
The hidden state and cell state of LSTMP
- hidden: LoDTensor with shape of :math:`[T, P]` , and its lod and dtype is the same as the input.
- cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='sequence', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim, proj_dim = 512, 256
fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
act=None, bias_attr=None)
proj_out, last_c = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
use_peepholes=False,
is_reverse=True,
cell_activation="tanh",
proj_activation="tanh")
proj_out.shape # (-1, 256)
last_c.shape # (-1, 512)
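# A minimal sketch of state clipping (the clip values are illustrative): cell_clip
# bounds the cell state before the output activation, proj_clip bounds the projection.
clipped_proj, clipped_cell = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
cell_clip=3.0,
proj_clip=3.0)
clipped_proj.shape # (-1, 256)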
"""
assert in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstmp', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
proj_weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
projection = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
ordered_proj0 = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, proj_size), \
'The shape of h0 should be (batch_size, %d)' % proj_size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
if cell_clip:
assert cell_clip >= 0, "cell_clip should not be negative."
if proj_clip:
assert proj_clip >= 0, "proj_clip should not be negative."
helper.append_op(
type='lstmp',
inputs=inputs,
outputs={
'Projection': projection,
'Cell': cell,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'cell_clip': cell_clip,
'proj_clip': proj_clip,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation,
'proj_activation': proj_activation
})
return projection, cell
def dynamic_gru(input,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False):
"""
**Note: The input of this OP must be LoDTensor. If your input is Tensor,
use** :ref:`api_fluid_layers_StaticRNN` .
This operator is used to perform the calculations for a single layer of
Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
in one time step support these two modes:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
:math:`x_t` is the input of the current time step, but it is not from ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` .
**Note**: thus a fully-connected layer whose size is 3 times ``size`` should
be used before this operator, and its output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Args:
input(Variable): A LoDTensor whose lod level is 1, representing the input
after linear projection. Its shape should be :math:`[T, D \\times 3]` ,
where :math:`T` stands for the total sequence lengths in this mini-batch,
:math:`D` for the hidden size. The data type should be float32 or float64.
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
is_reverse(bool, optional): Whether to compute in the reversed order of
input sequences. Default False.
gate_activation(str, optional): The activation function corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
candidate_activation(str, optional): The activation function corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
h_0 (Variable, optional): A Tensor representing the initial hidden state.
If not provided, the default initial hidden state is 0. The shape is
:math:`[N, D]` , where :math:`N` is the number of sequences in the
mini-batch, :math:`D` for the hidden size. The data type should be
same as ``input`` . Default None.
Returns:
Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \
where :math:`T` stands for the total sequence lengths in this mini-batch \
:math:`D` for the hidden size. It represents GRU transformed sequence output, \
and has the same lod and data type with ``input`` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='sequence',
shape=[None],
dtype='int64',
lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
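# A minimal bidirectional sketch (illustrative names): a reversed pass reuses the
# same pattern with is_reverse=True; the two outputs can be concatenated feature-wise.
x_r = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden_r = fluid.layers.dynamic_gru(input=x_r, size=hidden_dim, is_reverse=True)
bi_gru = fluid.layers.concat(input=[hidden, hidden_r], axis=1)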
"""
assert in_dygraph_mode(
) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
helper = LayerHelper('gru', **locals())
dtype = helper.input_dtype()
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
bias = helper.create_parameter(
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0:
assert h_0.shape == (
batch_size, size
), 'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='gru',
inputs=inputs,
outputs={
'Hidden': hidden,
'BatchGate': batch_gate,
'BatchResetHiddenPrev': batch_reset_hidden_prev,
'BatchHidden': batch_hidden
},
attrs={
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'activation': candidate_activation,
'origin_mode': origin_mode
})
return hidden
def gru_unit(input,
hidden,
size,
param_attr=None,
bias_attr=None,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False):
"""
Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
one time step and it supports these two modes:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
:math:`x_t` is the input of the current time step, but it is not ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` .
**Note**: thus a fully-connected layer whose size is 3 times the GRU hidden size should
be used before this operator, and its output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Args:
input(Variable): A 2D Tensor representing the input after linear projection.
Its shape should be :math:`[N, D \\times 3]` ,
where :math:`N` stands for batch size, :math:`D` for the hidden size.
The data type should be float32 or float64.
hidden(Variable): A 2D Tensor representing the hidden state from previous step.
Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size,
:math:`D` for the hidden size. The data type should be same as ``input`` .
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
activation(str, optional): The activation function corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
gate_activation(str, optional): The activation function corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
Returns:
tuple: The tuple contains three Tensor variables with the same data type \
as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \
reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \
concatenation of :math:`h_t, r_t, \\tilde{h_t}` . And they have shape \
:math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \times 3]` separately. \
Usually only the hidden state for next time step ( :math:`h_t` ) is used \
as output and state, the other two are intermediate results of calculations.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='step_data', shape=[None], dtype='int64')
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
hidden, reset_hidden_prev, gate = fluid.layers.gru_unit(
input=x, hidden=pre_hidden, size=hidden_dim * 3)
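# A minimal sketch of running gru_unit step by step over a whole sequence with
# DynamicRNN (illustrative names; assumes a 1-level LoD input):
seq_data = fluid.data(name='seq_data', shape=[None], dtype='int64', lod_level=1)
seq_emb = fluid.embedding(input=seq_data, size=[dict_dim, emb_dim])
drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(seq_emb)
    prev_hidden = drnn.memory(shape=[hidden_dim], value=0.0, dtype='float32')
    proj = fluid.layers.fc(input=word, size=hidden_dim * 3)
    cur_hidden, _, _ = fluid.layers.gru_unit(
        input=proj, hidden=prev_hidden, size=hidden_dim * 3)
    drnn.update_memory(prev_hidden, cur_hidden)
    drnn.output(cur_hidden)
gru_out = drnn()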
"""
activation_dict = dict(
identity=0,
sigmoid=1,
tanh=2,
relu=3, )
activation = activation_dict[activation]
gate_activation = activation_dict[gate_activation]
helper = LayerHelper('gru_unit', **locals())
dtype = helper.input_dtype()
size = size // 3
# create weight
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
gate = helper.create_variable_for_type_inference(dtype)
reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
updated_hidden = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
# create bias
if helper.bias_attr:
bias_size = [1, 3 * size]
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
inputs['Bias'] = bias
helper.append_op(
type='gru_unit',
inputs=inputs,
outputs={
'Gate': gate,
'ResetHiddenPrev': reset_hidden_pre,
'Hidden': updated_hidden,
},
attrs={
'activation': activation,
'gate_activation': gate_activation,
'origin_mode': origin_mode
})
return updated_hidden, reset_hidden_pre, gate
def beam_search(pre_ids,
pre_scores,
ids,
scores,
beam_size,
end_id,
level=0,
is_accumulated=True,
name=None,
return_parent_idx=False):
"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
**This operator only supports LoDTensor.** It is used after finishing
scores calculation to perform beam search for one time step. Specifically,
after ``ids`` and ``scores`` have been produced, it selects the top-K
( `k` is ``beam_size`` ) candidate word ids of current step from ``ids``
according to the corresponding ``scores``. Additionally, ``pre_ids`` and
``pre_scores`` are the output of `beam_search` at previous step, they
are needed for special use to handle ended candidate translations.
Note that if ``is_accumulated`` is True, the ``scores`` passed in should
be accumulated scores. Otherwise, the ``scores`` are
considered as the probabilities of single step and would be transformed to
the log field and added up with ``pre_scores`` for final scores in this
operator. Length penalty should be done with extra operators before calculating
the accumulated scores if needed.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
pre_ids(Variable): A LodTensor variable (lod level is 2), representing
the selected ids of previous step. It is the output of beam_search
at previous step. Its shape is `[batch_size, 1]` and its lod is
`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step. The data type should be int64.
pre_scores(Variable): A LodTensor variable has the same shape and lod
with ``pre_ids`` , representing the accumulated scores corresponding
to the selected ids of previous step. It is the output of
beam_search at previous step. The data type should be float32.
ids(Variable|None): A LodTensor variable containing the candidate ids.
It has the same lod with ``pre_ids`` and its shape should be
`[batch_size * beam_size, K]`, where `K` is supposed to be greater than
``beam_size`` and the first dimension size (which decreases as samples reach
the end) should be the same as that of ``pre_ids`` . The data type
should be int64. It can be None, in which case the indices in ``scores``
are used as ids.
scores(Variable): A LodTensor variable containing the accumulated
scores corresponding to ``ids`` . Both its shape and lod are same as
those of ``ids`` . The data type should be float32.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
level(int): **It can be ignored and mustn't change currently.**
The 2 level lod used in this operator has the following
meaning: The first level describes how many beams each sample has,
which would change to 0 when beams of the sample all end (batch reduce);
The second level describes how many times each beam is selected.
Default 0, which shouldn't be changed currently.
is_accumulated(bool): Whether the input ``scores`` is accumulated scores.
Default True.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
return_parent_idx(bool, optional): Whether to return an extra Tensor variable
in output, which stores the selected ids' parent indices in
``pre_ids`` and can be used to update RNN's states by gather operator.
Default False.
Returns:
tuple: The tuple contains two or three LodTensor variables. The two LodTensor, \
representing the selected ids and the corresponding accumulated scores of \
current step, have the same shape `[batch_size, beam_size]` and lod with 2 levels, \
and have data types int64 and float32. If ``return_parent_idx`` is True, \
an extra Tensor variable preserving the selected ids' parent indices \
is included, whose shape is `[batch_size * beam_size]` and data type \
is int64.
Examples:
.. code-block:: python
import paddle.fluid as fluid
# Suppose `probs` contains predicted results from the computation
# cell and `pre_ids` and `pre_scores` is the output of beam_search
# at previous step.
beam_size = 4
end_id = 1
pre_ids = fluid.data(
name='pre_id', shape=[None, 1], lod_level=2, dtype='int64')
pre_scores = fluid.data(
name='pre_scores', shape=[None, 1], lod_level=2, dtype='float32')
probs = fluid.data(
name='probs', shape=[None, 10000], dtype='float32')
topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(x=topk_scores),
y=fluid.layers.reshape(pre_scores, shape=[-1]),
axis=0)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=end_id)
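# A minimal sketch (illustrative): with return_parent_idx=True an extra tensor of
# parent beam indices is returned, which can be used with fluid.layers.gather
# to reorder decoder states for the next step.
selected_ids, selected_scores, parent_idx = fluid.layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=end_id,
return_parent_idx=True)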
"""
helper = LayerHelper('beam_search', **locals())
score_type = pre_scores.dtype
id_type = pre_ids.dtype
inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
if ids is not None:
inputs["ids"] = ids
selected_scores = helper.create_variable_for_type_inference(
dtype=score_type)
selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
# parent_idx is a tensor used to gather cell states at the next time
# step. Though lod in selected_ids can also be used to gather by
# sequence_expand, it is not efficient.
# gather_op's index input only supports int32 dtype currently
parent_idx = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='beam_search',
inputs=inputs,
outputs={
'selected_ids': selected_ids,
'selected_scores': selected_scores,
'parent_idx': parent_idx
},
attrs={
# TODO(ChunweiYan) to assure other value support
'level': level,
'beam_size': beam_size,
'end_id': end_id,
'is_accumulated': is_accumulated,
})
if return_parent_idx:
return selected_ids, selected_scores, parent_idx
else:
return selected_ids, selected_scores
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
This operator is used after beam search has completed. It constructs the
full predicted sequences for each sample by walking back along the search
paths stored in lod of ``ids`` . The result sequences are stored in a
LoDTensor, which uses the following way to parse:
.. code-block:: text
If lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
The first level of lod stands for: There are 2 samples, each having 3
(beam width) predicted sequences.
The second level of lod stands for: The lengths of the first sample's
3 predicted sequences are 12, 12, 16; The lengths of the second sample's
3 predicted sequences are 14, 13, 15.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
ids(Variable): The LoDTensorArray variable containing the selected ids
of all steps. Each LoDTensor in it has int64 data type and 2 level
lod which can be used to get the search paths.
scores(Variable): The LodTensorArray variable containing the accumulated
scores corresponding to selected ids of all steps. It has the same size
as ``ids`` . Each LoDTensor in it has the same shape and lod as the
counterpart in ``ids`` , and has a float32 data type.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
tuple: The tuple contains two LodTensor variables. The two LodTensor, \
containing the full sequences of ids and the corresponding accumulated \
scores, have the same shape flattened to 1D and have the same 2 level \
lod. The lod can be used to get how many predicted sequences each sample \
has and how many ids each predicted sequence has.
Examples:
.. code-block:: python
import paddle.fluid as fluid
# Suppose `ids` and `scores` are LodTensorArray variables reserving
# the selected ids and scores of all steps
ids = fluid.layers.create_array(dtype='int64')
scores = fluid.layers.create_array(dtype='float32')
finished_ids, finished_scores = fluid.layers.beam_search_decode(
ids, scores, beam_size=5, end_id=0)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
},
attrs={"beam_size": beam_size,
"end_id": end_id})
return sentence_ids, sentence_scores
def lstm_unit(x_t,
hidden_t_prev,
cell_t_prev,
forget_bias=0.0,
param_attr=None,
bias_attr=None,
name=None):
"""
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
one time step, whose implementation is based on calculations described in `RECURRENT
NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_ .
We add forget_bias to the biases of the forget gate in order to
reduce the scale of forgetting. The formula is as follows:
.. math::
i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} tanh (c_{t})
:math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step;
:math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` ,
representing the output from the previous time step.
:math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are the input gate, forget gate, cell, output gate
and hidden calculation results.
Args:
x_t(Variable): A 2D Tensor representing the input of current time step.
Its shape should be :math:`[N, M]` , where :math:`N` stands for batch
size, :math:`M` for the feature size of input. The data type should
be float32 or float64.
hidden_t_prev(Variable): A 2D Tensor representing the hidden value from
previous step. Its shape should be :math:`[N, D]` , where :math:`N`
stands for batch size, :math:`D` for the hidden size. The data type
should be same as ``x_t`` .
cell_t_prev(Variable): A 2D Tensor representing the cell value from
previous step. It has the same shape and data type with ``hidden_t_prev`` .
forget_bias (float, optional): :math:`forget\\_bias` added to the biases
of the forget gate. Default 0.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
tuple: The tuple contains two Tensor variables with the same shape and \
data type with ``hidden_t_prev`` , representing the hidden value and \
cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \
the formula.
Raises:
ValueError: Rank of x_t must be 2.
ValueError: Rank of hidden_t_prev must be 2.
ValueError: Rank of cell_t_prev must be 2.
ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same.
ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim, hidden_dim = 128, 64, 512
data = fluid.data(name='step_data', shape=[None], dtype='int64')
x = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
pre_cell = fluid.data(
name='pre_cell', shape=[None, hidden_dim], dtype='float32')
hidden, cell = fluid.layers.lstm_unit(
x_t=x,
hidden_t_prev=pre_hidden,
cell_t_prev=pre_cell)
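# A minimal sketch of chaining a second step (illustrative names): feed the returned
# hidden and cell states of one call into the next call as the previous states.
next_data = fluid.data(name='next_step_data', shape=[None], dtype='int64')
x_next = fluid.embedding(input=next_data, size=[dict_dim, emb_dim])
hidden_next, cell_next = fluid.layers.lstm_unit(
x_t=x_next,
hidden_t_prev=hidden,
cell_t_prev=cell)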
"""
helper = LayerHelper('lstm_unit', **locals())
if len(x_t.shape) != 2:
raise ValueError("Rank of x_t must be 2.")
if len(hidden_t_prev.shape) != 2:
raise ValueError("Rank of hidden_t_prev must be 2.")
if len(cell_t_prev.shape) != 2:
raise ValueError("Rank of cell_t_prev must be 2.")
if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
0] != cell_t_prev.shape[0]:
raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
"cell_t_prev must be the same.")
if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
raise ValueError("The 2nd dimensions of hidden_t_prev and "
"cell_t_prev must be the same.")
if bias_attr is None:
bias_attr = ParamAttr()
size = cell_t_prev.shape[1]
concat_out = nn.concat(input=[x_t, hidden_t_prev], axis=1)
fc_out = nn.fc(input=concat_out,
size=4 * size,
param_attr=param_attr,
bias_attr=bias_attr)
dtype = x_t.dtype
c = helper.create_variable_for_type_inference(dtype)
h = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='lstm_unit',
inputs={"X": fc_out,
"C_prev": cell_t_prev},
outputs={"C": c,
"H": h},
attrs={"forget_bias": forget_bias})
return h, c
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .layer_function_generator import templatedoc
from ..framework import Variable, in_dygraph_mode
from ..layer_helper import LayerHelper
__all__ = [
'sequence_conv',
'sequence_softmax',
'sequence_pool',
'sequence_concat',
'sequence_first_step',
'sequence_last_step',
'sequence_slice',
'sequence_expand',
'sequence_expand_as',
'sequence_pad',
'sequence_unpad',
'sequence_reshape',
'sequence_scatter',
'sequence_enumerate',
'sequence_mask',
'sequence_reverse',
]
@templatedoc()
def sequence_conv(input,
num_filters,
filter_size=3,
filter_stride=1,
padding=True,
padding_start=None,
bias_attr=None,
param_attr=None,
act=None,
name=None):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ).
This operator receives input sequences with variable length and other convolutional
configuration parameters(num_filters, filter_size) to apply the convolution operation.
It fills all-zero padding data on both sides of the sequence by default to ensure that
the output is the same length as the input. You can customize the padding behavior by
configuring the parameter :attr:`padding\_start` .
**Warning:** the parameter :attr:`padding` takes no effect and will be deprecated in the future.
.. code-block:: text
Here we will illustrate the details of the padding operation:
For a mini-batch of 2 variable-length sentences, containing 3 and 1 time-steps:
Assumed input (X) is a [4, N] float LoDTensor, and for the sake of simplicity, we assume N=2.
input.data = [[1, 1],
[2, 2],
[3, 3],
[4, 4]]
This is to say that input (X) has 4 words and the dimension of each word
representation is 2.
* Case1:
If padding_start is -1 and filter_size is 3.
The length of padding data is calculated as follows:
up_pad_len = max(0, -padding_start) = 1
down_pad_len = max(0, filter_size + padding_start - 1) = 1
The output of the input sequence after padding is:
data_after_padding = [[0, 0, 1, 1, 2, 2],
[1, 1, 2, 2, 3, 3],
[2, 2, 3, 3, 0, 0],
[0, 0, 4, 4, 0, 0]]
It will be multiplied by the filter weight to get the final output.
Assume num_filters = 3
output.data = [[ 0.3234, -0.2334, 0.7433],
[ 0.5646, 0.9464, -0.1223],
[-0.1343, 0.5653, 0.4555],
[ 0.9954, -0.1234, -0.1234]]
output.shape = [4, 3] # 3 = num_filters
output.lod = [[0, 3, 4]] # Remain the same
Args:
input (Variable): LoDTensor with shape :math:`(M, K)`, where M is the total time-step of mini-batch
and K is hidden_size of input. Only lod_level of 1 is supported. The data type should be float32 or
float64.
num_filters (int): the number of filters.
filter_size (int): the height of filter. Specified filter width is not supported, the width is
hidden_size by default. Default: 3.
filter_stride (int): stride of the filter. Currently only supports :attr:`stride` = 1.
padding (bool): the parameter :attr:`padding` takes no effect and will be discarded in the
future. Currently, it will always pad input to make sure the length of the output is
the same as input whether :attr:`padding` is set true or false. Because the length of
input sequence may be shorter than :attr:`filter\_size`, which will cause the convolution
result to not be computed correctly. These padding data will not be trainable or updated
while training. Default: True.
padding_start (int): It is used to indicate the start index for padding the input
sequence, which can be negative. The negative number means to pad
:attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance.
The positive number means to skip :attr:`padding_start` time-steps of each instance,
and it will pad :math:`filter\_size + padding\_start - 1` time-steps of all-zero data
at the end of the sequence to ensure that the output is the same length as the input.
If set None, the same length :math:`\\frac{filter\_size}{2}` of data will be filled
on both sides of the sequence. If set 0, the length of :math:`filter\_size - 1` data
is padded at the end of each input sequence. Default: None.
bias_attr (ParamAttr): To specify the bias parameter property. Default: None, which means the
default bias parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
param_attr (ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
act (str): Activation to be applied to the output of this layer, such as tanh, softmax,
sigmoid, relu. For more information, please refer to :ref:`api_guide_activations_en` . Default: None.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: LoDTensor with the same length as input. The data type is float32 or float64, which is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1)
x_conved = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1)
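# A minimal sketch of the default padding behaviour (illustrative name): with
# padding_start=None, filter_size // 2 all-zero time-steps are padded on both
# sides, so the output keeps the same length as the input.
x_conved_default = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3)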
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()
filter_shape = [filter_size * input.shape[1], num_filters]
filter_param = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype)
pre_bias = helper.create_variable_for_type_inference(dtype)
if padding_start is None:
padding_start = -int(filter_size // 2)
helper.append_op(
type='sequence_conv',
inputs={
'X': [input],
'Filter': [filter_param],
},
outputs={"Out": pre_bias},
attrs={
'contextStride': filter_stride,
'contextStart': padding_start,
'contextLength': filter_size,
})
pre_act = helper.append_bias_op(pre_bias)
return helper.append_activation(pre_act)
def sequence_softmax(input, use_cudnn=False, name=None):
"""
**Note**:
**The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax`
A LoD-tensor can be regarded as several sequences, and this op apply softmax algo on each sequence.
The shape of input Tensor can be :math:`[N, 1]` or :math:`[N]`, where :math:`N`
is the sum of the length of all sequences. Recommended usage: :math:`[N]`.
For i-th sequence in a mini-batch:
.. math::
Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
For example, for a LoD-Tensor with 6 sequences ([3, 2, 4, 1, 2, 3] - sequence length list in order),
the lod in the runtime is [[0, 3, 5, 9, 10, 12, 15]],
then softmax will be computed among :math:`X[0:3,:],X[3:5,:],X[5:9,:],X[9:10,:],X[10:12,:],X[12:15,:]`,
and :math:`N` turns out to be 15.
.. code-block:: text
*Case 1:
Given:
input.data = [0.7, 1, 0.6,
1.5, 1.1,
1.2, 0.2, 0.6, 1.9,
3.1,
2.5, 0.8,
0.1, 2.4, 1.3]
input.lod = [[0, 3, 5, 9, 10, 12, 15]]
then:
output.data = [0.30724832, 0.41474187, 0.2780098,
0.59868765, 0.40131235,
0.2544242, 0.09359743, 0.13963096, 0.5123474,
1.,
0.84553474, 0.15446526,
0.06995796, 0.69777346, 0.23226859]
output.lod = [[0, 3, 5, 9, 10, 12, 15]]
Args:
input (Variable): A LoDTensor with shape of :math:`[N, 1]` or :math:`[N]`. Recommended usage: :math:`[N]`.
Supported data types: float32, float64.
use_cudnn (bool, optional): Use cudnn kernel or not. Effective only when the cudnn version of the paddle
library is installed and GPU is used for training or inference. Default: False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: A LoD-Tensor which has the same shape and data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1)
x_sequence_softmax_1 = fluid.layers.sequence_softmax(input=x)
y = fluid.data(name='y', shape=[7],
dtype='float32', lod_level=1)
x_sequence_softmax_2 = fluid.layers.sequence_softmax(input=y)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="sequence_softmax",
inputs={"X": input},
outputs={"Out": softmax_out},
attrs={"use_cudnn": use_cudnn})
return softmax_out
def sequence_pool(input, pool_type, is_test=False, pad_value=0.0):
"""
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ).
This operator only supports LoDTensor as input. It will apply specified pooling
operation on the input LoDTensor. It pools features of all time-steps of each
sequence at the last lod_level using :attr:`pool_type` mentioned in the parameters,
such as sum, average, sqrt, etc.
It supports six pool_type:
- average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
- sum: :math:`Out[i] = \sum_jX_{ij}`
- sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
- max: :math:`Out[i] = max(X_i)`
- last: :math:`Out[i] = X_{N_i}`
- first: :math:`Out[i] = X_0`
where :math:`N_i` is the length of i-th input sequence.
.. code-block:: text
Case 1:
input is a 1-level LoDTensor and pad_value = 0.0:
input.lod = [[0, 2, 5, 7, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is LoDTensor:
out.shape = [4, 1]
with condition out.shape[0] == len(x.lod[-1]) - 1 == 4
for different pool_type:
average: out.data = [[2.], [4.], [3.], [0.0]], where 2.=(1. + 3.)/2, 4.=(2. + 4. + 6.)/3, 3.=(5. + 1.)/2
sum : out.data = [[4.], [12.], [6.], [0.0]], where 4.=1. + 3., 12.=2. + 4. + 6., 6.=5. + 1.
sqrt : out.data = [[2.82], [6.93], [4.24], [0.0]], where 2.82=(1. + 3.)/sqrt(2), 6.93=(2. + 4. + 6.)/sqrt(3), 4.24=(5. + 1.)/sqrt(2)
max : out.data = [[3.], [6.], [5.], [0.0]], where 3.=max(1., 3.), 6.=max(2., 4., 6.), 5.=max(5., 1.)
last : out.data = [[3.], [6.], [1.], [0.0]], where 3.=last(1., 3.), 6.=last(2., 4., 6.), 1.=last(5., 1.)
first : out.data = [[1.], [2.], [5.], [0.0]], where 1.=first(1., 3.), 2.=first(2., 4., 6.), 5.=first(5., 1.)
and all above [0.0] at last of out.data is padding data.
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means empty sequence.
The first sequence contains 2 subsequences with length info [1, 2];
The last sequence contains 3 subsequences with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
If pool_type = sum, it will apply pooling on last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
where out.shape[0] == len(x.lod[-1]) - 1 == 5
sum: out.data = [[1.], [5.], [4.], [0.0], [12.]]
where 1.=1., 5.=3. + 2., 4.=4., 0.0=pad_value, 12.=6. + 5. + 1.
Args:
input (Variable): LoDTensor with lod_level no more than 2. The data type should be float32.
pool_type (str): The pooling type that supports average, sum, sqrt, max, last or first.
is_test (bool): Only works when :attr:`pool_type` is max. If set False, a temporary Tensor maxIndex is
created to record the index information corresponding to the maximum value, which is used for backward
gradient calculation in the training phase. Default: False.
pad_value (float): Used to pad the pooling result for empty input sequence. Default: 0.0
Returns:
Variable: LoDTensor after pooling with data type float32.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
max_index = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="sequence_pool",
inputs={"X": input},
outputs={"Out": pool_out,
"MaxIndex": max_index},
attrs={
"pooltype": pool_type.upper(),
"is_test": is_test,
"pad_value": pad_value
})
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
if pool_type == 'max':
max_index.stop_gradient = True
return pool_out
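# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged example of running sequence_pool end
# to end, assuming a fresh default program; names such as `pool_x` and `x_lod_tensor`
# are made up for this sketch.
import paddle.fluid as fluid
import numpy as np

pool_x = fluid.data(name='pool_x', shape=[None, 1], dtype='float32', lod_level=1)
pool_sum = fluid.layers.sequence_pool(input=pool_x, pool_type='sum')

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.array([[1.], [3.], [2.], [4.], [6.], [5.], [1.]]).astype('float32')
# length-based lod [[2, 3, 2]] corresponds to the offset-based lod [[0, 2, 5, 7]]
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 3, 2]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'pool_x': x_lod_tensor},
                 fetch_list=[pool_sum],
                 return_numpy=False)
print(np.array(result[0]))  # expected: [[4.], [12.], [6.]]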
@templatedoc()
def sequence_concat(input, name=None):
"""
**Note: This OP only accepts LoDTensor as input. If your input is a Tensor, please use the concat OP (fluid.layers.** :ref:`api_fluid_layers_concat` ).
This operator only supports LoDTensor as input. It concatenates the multiple LoDTensor from input by the LoD information,
and outputs the concatenated LoDTensor.
.. code-block:: text
input is a list of LoDTensor:
input = [x1, x2]
where:
x1.lod = [[0, 3, 5]]
x1.data = [[1], [2], [3], [4], [5]]
x1.shape = [5, 1]
x2.lod = [[0, 2, 4]]
x2.data = [[6], [7], [8], [9]]
x2.shape = [4, 1]
and should satisfy: len(x1.lod[0]) == len(x2.lod[0])
output is LoDTensor:
out.lod = [[0, 3+2, 5+4]]
out.data = [[1], [2], [3], [6], [7], [4], [5], [8], [9]]
out.shape = [9, 1]
Args:
input(list of Variable): List of LoDTensor to be concatenated. Each LoDTensor in the list should contain the same number of sequences.
The data type can be float32, float64 or int64.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: Output the concatenated LoDTensor. The data type is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1)
y = fluid.data(name='y', shape=[-1, 10], dtype='float32', lod_level=1)
out = fluid.layers.sequence_concat(input=[x, y])
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op(
type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
return out
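# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_concat on the two
# LoDTensors from the text case above, assuming a fresh default program; the names
# `concat_a` and `concat_b` are made up for this sketch.
import paddle.fluid as fluid
import numpy as np

concat_a = fluid.data(name='concat_a', shape=[None, 1], dtype='float32', lod_level=1)
concat_b = fluid.data(name='concat_b', shape=[None, 1], dtype='float32', lod_level=1)
concat_out = fluid.layers.sequence_concat(input=[concat_a, concat_b])

place = fluid.CPUPlace()
exe = fluid.Executor(place)
a = fluid.create_lod_tensor(
    np.array([[1.], [2.], [3.], [4.], [5.]]).astype('float32'), [[3, 2]], place)
b = fluid.create_lod_tensor(
    np.array([[6.], [7.], [8.], [9.]]).astype('float32'), [[2, 2]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'concat_a': a, 'concat_b': b},
                 fetch_list=[concat_out],
                 return_numpy=False)
print(np.array(result[0]))  # expected order: 1, 2, 3, 6, 7, 4, 5, 8, 9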
def sequence_first_step(input):
"""
This operator only supports LoDTensor as input. Given the input LoDTensor, it will
select the first time-step feature of each sequence as output.
.. code-block:: text
Case 1:
input is 1-level LoDTensor:
input.lod = [[0, 2, 5, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is a LoDTensor:
out.shape = [3, 1]
out.shape[0] == len(x.lod[-1]) == 3
out.data = [[1.], [2.], [5.]], where 1.=first(1., 3.), 2.=first(2., 4., 6.), 5.=first(5., 1.)
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means an empty sequence.
The first sequence contains 2 subsequences with length info [1, 2];
the last sequence contains 3 subsequences with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
It will apply pooling on last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is a LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
out.shape[0] == len(x.lod[-1]) == 5
out.data = [[1.], [3.], [4.], [0.0], [6.]]
where 1.=first(1.), 3.=first(3., 2.), 4.=first(4.), 0.0 = pad_value, 6.=first(6., 5., 1.)
Args:
input(Variable): LoDTensor with lod_level no more than 2. The data type should be float32.
Returns:
Variable: LoDTensor consisting of the sequence's first step vector. The data type is float32.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_first_step = fluid.layers.sequence_first_step(input=x)
"""
return sequence_pool(input=input, pool_type="first")
def sequence_last_step(input):
"""
This operator only supports LoDTensor as input. Given the input LoDTensor, it will
select the last time-step feature of each sequence as output.
.. code-block:: text
Case 1:
input is 1-level LoDTensor:
input.lod = [[0, 2, 5, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
output is a LoDTensor:
out.shape = [3, 1]
out.shape[0] == len(x.lod[-1]) == 3
out.data = [[3.], [6.], [1.]], where 3.=last(1., 3.), 6.=last(2., 4., 6.), 1.=last(5., 1.)
Case 2:
input is a 2-level LoDTensor containing 3 sequences with length info [2, 0, 3],
where 0 means an empty sequence.
The first sequence contains 2 subsequences with length info [1, 2];
the last sequence contains 3 subsequences with length info [1, 0, 3].
input.lod = [[0, 2, 2, 5], [0, 1, 3, 4, 4, 7]]
input.data = [[1.], [3.], [2.], [4.], [6.], [5.], [1.]]
input.shape = [7, 1]
It will apply pooling on last lod_level [0, 1, 3, 4, 4, 7]. pad_value = 0.0
output is a LoDTensor:
out.shape= [5, 1]
out.lod = [[0, 2, 2, 5]]
out.shape[0] == len(x.lod[-1]) == 5
out.data = [[1.], [2.], [4.], [0.0], [1.]]
where 1.=last(1.), 2.=last(3., 2.), 4.=last(4.), 0.0 = pad_value, 1=last(6., 5., 1.)
Args:
input(Variable): LoDTensor with lod_level no more than 2. The data type should be float32.
Returns:
Variable: LoDTensor consisting of the sequence's last step vector. The data type is float32.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_last_step = fluid.layers.sequence_last_step(input=x)
"""
return sequence_pool(input=input, pool_type="last")
def sequence_slice(input, offset, length, name=None):
"""
**Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start
offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text
- Case:
Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]],
input.dims = (5, 2),
with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]],
out.dims = (3, 2).
Note:
The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
input(Variable): LoDTensor, the input Variable which consists of the complete
sequences. The data type is float32 or float64.
offset(Variable): LoDTensor, the offset to slice each sequence. The data
type is int32 or int64.
length(Variable): LoDTensor, the length of each subsequence. The data
type is int32 or int64.
name(str|None): The default value is None. Normally there is no need
for user to set this property. For more information,
please refer to :ref:`api_guide_Name`
Returns:
Variable: The output subsequences.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
seqs = fluid.data(name='x', shape=[10, 5],
dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
offset.stop_gradient = True
length.stop_gradient = True
helper.append_op(
type="sequence_slice",
inputs={"X": input,
"Offset": offset,
"Length": length},
outputs={"Out": out})
return out
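# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_slice mirroring the
# text case above (offset and length shaped [num_sequences, 1], as in that case);
# names such as `slice_x` are made up, and the shapes follow the documented case
# rather than a verified API contract.
import paddle.fluid as fluid
import numpy as np

slice_x = fluid.data(name='slice_x', shape=[None, 2], dtype='float32', lod_level=1)
slice_offset = fluid.layers.assign(input=np.array([[0], [1]]).astype('int32'))
slice_length = fluid.layers.assign(input=np.array([[2], [1]]).astype('int32'))
slice_out = fluid.layers.sequence_slice(input=slice_x, offset=slice_offset,
                                        length=slice_length)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.arange(10).reshape([5, 2]).astype('float32')  # rows a, b, c, d, e
x_lod_tensor = fluid.create_lod_tensor(np_data, [[3, 2]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'slice_x': x_lod_tensor},
                 fetch_list=[slice_out],
                 return_numpy=False)
print(np.array(result[0]))  # expected rows: a, b, e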
def sequence_expand(x, y, ref_level=-1, name=None):
"""Sequence Expand Layer. This layer will expand the input variable ``x`` \
according to specified level ``ref_level`` lod of ``y``. Please note that \
the lod level of ``x`` is at most 1. If the lod level of ``x`` is 1, than \
the size of lod of ``x`` must be equal to the length of ``ref_level`` lod \
of ``y``. If the lod level of ``x`` is 0, then the first dim of ``x`` should \
be equal to the size of ``ref_level`` of ``y``. The rank of **x** is at least 2. \
When rank of ``x`` is greater than 2, then it would be viewed as a 2-D tensor.
Please note that the input ``x`` should be LodTensor or Tensor, \
and input ``y`` must be LodTensor.
Following examples will explain how sequence_expand works:
.. code-block:: text
Case 1
Consider 2 sequences [a][b] and [c][d], now we want to expand them to [a][b], [a][b], [c][d] and [c][d].
Sequence [a][b] is expanded twice and [c][d] is expanded twice, so the lod we expand according to is [2, 2].
Input x is a 1-level LoDTensor:
x.lod = [[2, 2]] #lod based on length may be easier to understand
x.data = [[a], [b], [c], [d]]
x.dims = [4, 1]
input y is a LoDTensor:
y.lod = [[2, 2], #the 0th level lod, according to this level
[3, 3, 1, 1]] #the 1st level lod, it has nothing to do with this level
ref_level: 0
then output is a 1-level LoDTensor out:
out.lod = [[2, 2, 2, 2]] #lod based on length
out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
out.dims = [8, 1]
Case 2
Consider 3 sequences [a], [b], [c], now we want to expand them to [a][a], [c][c][c].
It's obvious that the lod info of expanded sequences is [2, 0, 3].
x is a Tensor:
x.data = [[a], [b], [c]]
x.dims = [3, 1]
y is a LoDTensor:
y.lod = [[2, 0, 3]]
ref_level: -1
then output is a 1-level LoDTensor:
out.data = [[a], [a], [c], [c], [c]]
out.dims = [5, 1]
Args:
x (Variable): The input variable which is a Tensor or LoDTensor, with the \
dims ``[M, K]``. The lod level is at most 1. The data type should be \
float32, float64, int8, int32 or int64.
y (Variable): The input variable which is a LoDTensor, the lod level is \
at least 1.
ref_level (int): Lod level of ``y`` to be referred by ``x``. If set to -1, \
refer the last level of lod.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The expanded variable which is a LoDTensor, with dims ``[N, K]``. \
``N`` depends on the lod info of ``x`` and ``y``. \
The data type is same as input.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
x = fluid.data(name='x', shape=[4, 1], dtype='float32')
y = fluid.data(name='y', shape=[8, 1],
dtype='float32', lod_level=1)
out = layers.sequence_expand(x=x, y=y, ref_level=0)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
np_data = np.array([[1], [2], [3], [4]]).astype('float32')
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2]], place)
print(x_lod_tensor)
#lod: [[0, 2, 4]]
# dim: 4, 1
# layout: NCHW
# dtype: float
# data: [1 2 3 4]
np_data = np.array([[1], [2], [3], [4], [5], [6], [7], [8]]).astype('float32')
y_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2], [3,3,1,1]], place)
print(y_lod_tensor)
#lod: [[0, 2, 4][0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: int64_t
# data: [0 0 1 1 1 1 1 0]
out_main = exe.run(fluid.default_main_program(),
feed={'x': x_lod_tensor, 'y': y_lod_tensor},
fetch_list=[out], return_numpy=False)
print(out_main[0])
#lod: [[0, 2, 4, 6, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: float
# data: [1 2 1 2 3 4 3 4]
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='sequence_expand',
inputs={'X': x,
'Y': y},
outputs={'Out': tmp},
attrs={'ref_level': ref_level})
return tmp
def sequence_expand_as(x, y, name=None):
"""Sequence Expand As Layer. This OP will expand the input variable ``x`` \
according to the zeroth level lod of ``y``. Current implementation requires \
the level number of ``y``'s lod must be 1, and the first dimension of \
``x`` should be equal to the size of ``y``'s zeroth level lod, thus \
the expanded LodTensor has the same lod info as ``y``. The expanded result \
has nothing to do with ``x``'s lod, so the lod of Input(X) is not considered.
Please note that the input ``x`` should be LodTensor or Tensor, \
and input ``y`` must be LodTensor.
Following examples will explain how sequence_expand_as works:
.. code-block:: text
Case 1:
Consider 4 sequences [a], [b], [c], [d], now we want to expand them to [a][a][a], [b][b][b], [c] and [d].
It's obvious that the lod info of expanded sequences is [0, 3, 6, 7, 8].
Given a 1-level LoDTensor ``x``:
x.data = [[a], [b], [c], [d]]
x.dims = [4, 1]
and input ``y``
y.lod = [[3, 3, 1, 1]] #lod based on length may be easier to understand
then we get 1-level LoDTensor out:
Out.lod = [[0, 3, 6, 7, 8]] #based on offset
Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
Out.dims = [8, 1]
Case 2:
Given a common Tensor ``x``:
x.data = [[a, b], [c, d], [e, f]]
x.dims = [3, 2]
and input ``y``:
y.lod = [[0, 2, 3, 6]]
then we get a 1-level LoDTensor:
out.lod = [[0, 2, 3, 6]]
out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
out.dims = [6, 2]
Args:
x (Variable): The input variable which is a Tensor or LoDTensor, with the \
dims ``[M, K]``. The data type should be float32, float64, int8, int32 \
or int64.
y (Variable): The input variable which is a LoDTensor with 1-level lod.
name (str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The expanded variable which is a LoDTensor with the dims ``[N, K]``. \
``N`` depends on the lod of ``y``, and the lod level must be 1. \
The data type is same as input.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
x = fluid.data(name='x', shape=[4, 1], dtype='float32')
y = fluid.data(name='y', shape=[8, 1], dtype='float32', lod_level=1)
out = layers.sequence_expand_as(x=x, y=y)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
np_data = np.array([[1], [2], [3], [4]]).astype('float32')
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2]], place)
print(x_lod_tensor)
#lod: [[0, 2, 4]]
# dim: 4, 1
# layout: NCHW
# dtype: float
# data: [1 2 3 4]
np_data = np.array([[1], [2], [3], [4], [5], [6], [7], [8]]).astype('float32')
y_lod_tensor = fluid.create_lod_tensor(np_data, [[3,3,1,1]], place)
print(y_lod_tensor)
#lod: [[0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: int64_t
# data: [0 0 1 0 1 1 1 0]
out_main = exe.run(fluid.default_main_program(),
feed={'x': x_lod_tensor, 'y': y_lod_tensor},
fetch_list=[out], return_numpy=False)
print(out_main[0])
#lod: [[0, 3, 6, 7, 8]]
# dim: 8, 1
# layout: NCHW
# dtype: float
# data: [1 1 1 2 2 2 3 4]
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='sequence_expand_as',
inputs={'X': x,
'Y': y},
outputs={'Out': tmp})
return tmp
def sequence_pad(x, pad_value, maxlen=None, name=None):
"""
This layer pads the sequences in the same batch to a common length (according \
to ``maxlen``). The padding value is defined by ``pad_value``, and will be \
appended to the tail of the sequences. The result is a Python tuple ``(Out, Length)``: \
the LoDTensor ``Out`` is the padded sequences, and the LoDTensor ``Length`` is \
the length information of the input sequences. For removing padding data (the unpadding \
operation), see :ref:`api_fluid_layers_sequence_unpad` .
Please note that the input ``x`` should be a LoDTensor.
.. code-block:: text
Case 1:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a],[b],[c],[d],[e]]
pad_value:
pad_value.data = [0]
maxlen = 4
the output tuple (Out, Length):
Out.data = [[[a],[b],[0],[0]],[[c],[d],[e],[0]]]
Length.data = [2, 3] #Original sequences length
Case 2:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
pad_value:
pad_value.data = [0]
default maxlen = None, (the virtual value is 3, according to the shape of x)
the output tuple (Out, Length):
Out.data = [[[a1,a2],[b1,b2],[0,0]],[[c1,c2],[d1,d2],[e1,e2]]]
Length.data = [2, 3]
Case 3:
Given input 1-level LoDTensor x:
x.lod = [[0, 2, 5]]
x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
pad_value:
pad_value.data = [p1,p2]
default maxlen = None, (the virtual value is 3)
get tuple (Out, Length):
Out.data = [[[a1,a2],[b1,b2],[p1,p2]],[[c1,c2],[d1,d2],[e1,e2]]]
Length.data = [2, 3]
Args:
x (Variable): Input 1-level LoDTensor with dims ``[M, K]``. The batch \
size is described by the lod info (the number of sequences). \
The data type should be float32, float64, int8, int32 or int64.
pad_value (Variable): Padding value. It can be a scalar or a 1D tensor \
with length ``K``. If it's a scalar, it will be automatically broadcasted \
to a Tensor. The data type should be the same as ``x``.
maxlen (int, optional): The length of padded sequences, None by default. \
When it is None, all sequences will be padded up to the length of the \
longest one among them; when it is a certain positive value, it must be \
greater than the length of the longest original sequence.
name (str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: A Python tuple (Out, Length): the first is a 0-level LoDTensor \
``Out``, with the shape ``[batch_size, maxlen, K]``; the second is the original \
sequences' length info ``Length``, which should be a 0-level 1D LoDTensor. \
The size of ``Length`` is equal to the batch size, and the data type is int64.
Return Type: tuple
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(
input=numpy.array([0.0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
length = helper.create_variable_for_type_inference(dtype)
pad_value.stop_gradient = True
length.stop_gradient = True
if maxlen is None:
maxlen = -1
helper.append_op(
type='sequence_pad',
inputs={'X': x,
'PadValue': pad_value},
outputs={'Out': out,
'Length': length},
attrs={'padded_length': maxlen})
return out, length
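# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_pad on two sequences
# of lengths [2, 3], assuming a fresh default program; names such as `pad_x` are
# made up for this sketch.
import paddle.fluid as fluid
import numpy as np

pad_x = fluid.data(name='pad_x', shape=[None, 1], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=np.array([0.0], dtype=np.float32))
padded, seq_len = fluid.layers.sequence_pad(x=pad_x, pad_value=pad_value)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.array([[1.], [2.], [3.], [4.], [5.]]).astype('float32')
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 3]], place)
padded_np, len_np = exe.run(fluid.default_main_program(),
                            feed={'pad_x': x_lod_tensor},
                            fetch_list=[padded, seq_len])
print(padded_np)  # expected shape [2, 3, 1]: [[[1], [2], [0]], [[3], [4], [5]]]
print(len_np)     # expected lengths: 2 and 3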
def sequence_unpad(x, length, name=None):
"""
**Note**:
**The input of the OP is Tensor and the output is LoDTensor. For the padding operation, see:** :ref:`api_fluid_layers_sequence_pad`
The OP removes the padding data from the input based on the length information and returns a LoDTensor.
.. code-block:: text
Case 1:
Given input Variable **x**:
x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the actual length
specified by input Variable **length**:
length.data = [2, 3, 4],
after unpadding, the output Variable will be:
out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
out.lod = [[0, 2, 5, 9]]
Args:
x(Variable): A Tensor which contains padding data, and the number of its dimensions can not be less than 2.
Supported data types: float32, float64, int32, int64.
length(Variable): A 1D Tensor that stores the actual length of each sample, and its length
equals the size of the 0th dimension of ``x`` . Supported data types: int64.
name(str|None): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: A LoDTensor whose recursive sequence length is consistent with the information of the length parameter and it has the same data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
# pad data
x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0.0], dtype=numpy.float32))
pad_data, len = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
# unpad data
unpad_data = fluid.layers.sequence_unpad(x=pad_data, length=len)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_unpad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
length.stop_gradient = True
helper.append_op(
type='sequence_unpad',
inputs={'X': x,
'Length': length},
outputs={'Out': out})
return out
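# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_unpad on the padded
# data from the text case above, assuming a fresh default program; names such as
# `unpad_x` are made up for this sketch.
import paddle.fluid as fluid
import numpy as np

unpad_x = fluid.data(name='unpad_x', shape=[None, 5], dtype='float32')
unpad_len = fluid.data(name='unpad_len', shape=[None], dtype='int64')
unpad_out = fluid.layers.sequence_unpad(x=unpad_x, length=unpad_len)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
padded = np.arange(1, 16).reshape([3, 5]).astype('float32')  # rows 1..5, 6..10, 11..15
lengths = np.array([2, 3, 4]).astype('int64')
result = exe.run(fluid.default_main_program(),
                 feed={'unpad_x': padded, 'unpad_len': lengths},
                 fetch_list=[unpad_out],
                 return_numpy=False)
print(np.array(result[0]))  # expected values: 1, 2, 6, 7, 8, 11, 12, 13, 14
print(result[0].lod())      # expected offsets: [[0, 2, 5, 9]]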
def sequence_reshape(input, new_dim):
"""
**Note: This OP only accepts LoDTensor as input. If your input is a Tensor, please use the reshape OP (fluid.layers.** :ref:`api_fluid_layers_reshape` ).
This operator only supports LoDTensor as input. Given :attr:`new_dim` ,
it will compute new shape according to original length of each sequence,
original dimensions and :attr:`new_dim` . Then it will output a new LoDTensor
containing :attr:`new_dim` . Currently it only supports 1-level LoDTensor.
Please make sure that (original length * original dimensions) can be divided
by the :attr:`new_dim` with no remainder for each sequence.
.. code-block:: text
input is a LoDTensor:
input.lod = [[0, 2, 6]]
input.data = [[1, 2], [3, 4],
[5, 6], [7, 8],
[9, 10], [11, 12]]
input.shape = [6, 2]
set new_dim = 4
out is a LoDTensor:
out.lod = [[0, 1, 3]]
out.data = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]
out.shape = [3, 4]
Args:
input (Variable): 1-level LoDTensor with shape :math:`[M, K]` . The data type should
be int32, int64, float32 or float64.
new_dim (int): New dimension that the input LoDTensor is reshaped to.
Returns:
Variable: Reshaped LoDTensor according to new dimension. The data type is same as input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 16], dtype='float32', lod_level=1)
x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=4)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_reshape', **locals())
out = helper.create_variable_for_type_inference(helper.input_dtype())
helper.append_op(
type='sequence_reshape',
inputs={'X': [input]},
outputs={'Out': [out]},
attrs={'new_dim': new_dim})
return out
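# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_reshape reproducing
# the text case above, assuming a fresh default program; the name `reshape_x` is
# made up for this sketch.
import paddle.fluid as fluid
import numpy as np

reshape_x = fluid.data(name='reshape_x', shape=[None, 2], dtype='float32', lod_level=1)
reshape_out = fluid.layers.sequence_reshape(input=reshape_x, new_dim=4)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.arange(1, 13).reshape([6, 2]).astype('float32')
# length-based lod [[2, 4]] corresponds to the offset-based lod [[0, 2, 6]]
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 4]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'reshape_x': x_lod_tensor},
                 fetch_list=[reshape_out],
                 return_numpy=False)
print(np.array(result[0]))  # expected shape [3, 4]: [[1..4], [5..8], [9..12]]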
def sequence_scatter(input, index, updates, name=None):
"""
**Note**:
**The index and updates parameters of the OP must be LoDTensor.**
Add the updates data to the corresponding input according to the index.
The update rule is as follows: output[instance_index][index[pos]] = input[instance_index][index[pos]] + updates[pos],
where instance_index is the index of the sample in the batch to which pos belongs.
The value of output[i][j] depends on whether j can be found in the (i+1)-th interval of the index. If found,
out[i][j] = input[i][j] + updates[m][n], otherwise, out[i][j] = input[i][j].
For example, in the case below, the lod information of index divides it into three intervals. Because
element 0 can be found in the first interval of the index, it is updated with the value at
the corresponding position of updates, i.e. out[0][0] = input[0][0] + updates[0][0]. Because element 1 cannot
be found in the third interval of the index, out[2][1] = input[2][1].
.. code-block:: text
*Case 1:
Given:
input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
input.dims = [3, 6]
index.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]]
index.lod = [[0, 3, 8, 12]]
updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]]
updates.lod = [[ 0, 3, 8, 12]]
Then:
out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
[1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
out.dims = X.dims = [3, 6]
Args:
input (Variable): A Tensor with shape of :math:`[N, k_1... k_n]`. Supported data types: float32, float64, int32, int64.
index (Variable): A LoDTensor contains index information. Its LoD level must be 1 and its data type must be int64.
updates (Variable): A LoDTensor containing the updates information. It has the same LoD level as index and the
same data type as input. Supported data types: float32, float64, int32, int64.
name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information,
please refer to :ref:`api_guide_Name`
Returns:
Variable: A Tensor which has been updated. It has the same shape and data type with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
input = fluid.data( name="x", shape=[None, 3, 6], dtype='float32' )
index = fluid.data( name='index', shape=[12, 1], dtype='int64', lod_level=1)
updates = fluid.data( name='updates', shape=[12, 1], dtype='float32', lod_level=1)
output = fluid.layers.sequence_scatter(input, index, updates)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_scatter', **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="sequence_scatter",
inputs={"X": input,
"Ids": index,
"Updates": updates},
outputs={"Out": out})
return out
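# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_scatter reproducing
# Case 1 above, assuming a fresh default program; names such as `scatter_x` are made
# up for this sketch.
import paddle.fluid as fluid
import numpy as np

scatter_x = fluid.data(name='scatter_x', shape=[None, 6], dtype='float32')
scatter_idx = fluid.data(name='scatter_idx', shape=[None, 1], dtype='int64', lod_level=1)
scatter_upd = fluid.data(name='scatter_upd', shape=[None, 1], dtype='float32', lod_level=1)
scatter_out = fluid.layers.sequence_scatter(scatter_x, scatter_idx, scatter_upd)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
idx_np = np.array([[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]]).astype('int64')
upd_np = np.array([[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0],
                   [0.2], [0.3], [0.1], [0.4]]).astype('float32')
# length-based lod [[3, 5, 4]] corresponds to the offset-based lod [[0, 3, 8, 12]]
idx_lod = fluid.create_lod_tensor(idx_np, [[3, 5, 4]], place)
upd_lod = fluid.create_lod_tensor(upd_np, [[3, 5, 4]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'scatter_x': np.ones([3, 6]).astype('float32'),
                       'scatter_idx': idx_lod,
                       'scatter_upd': upd_lod},
                 fetch_list=[scatter_out])
print(result[0])  # expected first row: [1.3, 1.3, 1.4, 1.0, 1.0, 1.0]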
def sequence_enumerate(input, win_size, pad_value=0, name=None):
"""
Generate a new sequence of shape ``[d_1, win_size]`` for the input index \
sequence of shape ``[d_1, 1]``, which enumerates all the \
sub-sequences of length ``win_size`` of the input, \
padded by ``pad_value`` if necessary during generation.
Please note that the ``input`` must be a LoDTensor.
.. code-block:: text
Input x:
x.lod = [[0, 3, 5]]
x.data = [[1], [2], [3], [4], [5]]
x.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
out.lod = [[0, 3, 5]]
out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
out.dims = [5, 2]
Args:
input (Variable): The input variable, which is an index sequence. It \
should be a LoDTensor with shape ``[d_1, 1]`` and 1-level lod info. \
The data type should be float32, float64, int8, int32 or int64.
win_size (int): The window size for enumerating all sub-sequences.
pad_value (int, optional): The padding value, default 0.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The enumerated sequence variable, which is a LoDTensor with \
shape ``[d_1, win_size]`` and 1-level lod info. \
The data type is same as ``input``.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
helper.append_op(
type='sequence_enumerate',
inputs={'X': input},
outputs={'Out': out},
attrs={'win_size': win_size,
'pad_value': pad_value})
return out
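# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_enumerate reproducing
# the text case above, assuming a fresh default program; the name `enum_x` is made
# up for this sketch.
import paddle.fluid as fluid
import numpy as np

enum_x = fluid.data(name='enum_x', shape=[None, 1], dtype='int64', lod_level=1)
enum_out = fluid.layers.sequence_enumerate(input=enum_x, win_size=2, pad_value=0)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.array([[1], [2], [3], [4], [5]]).astype('int64')
# length-based lod [[3, 2]] corresponds to the offset-based lod [[0, 3, 5]]
x_lod_tensor = fluid.create_lod_tensor(np_data, [[3, 2]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'enum_x': x_lod_tensor},
                 fetch_list=[enum_out],
                 return_numpy=False)
print(np.array(result[0]))  # expected: [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]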
def sequence_mask(x, maxlen=None, dtype='int64', name=None):
"""
**SequenceMask Layer**
This layer outputs a mask according to the input :code:`x` and
:code:`maxlen` with data type of :code:`dtype`.
Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
:code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
.. math::
y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
.. code-block:: text
Case:
Consider input:
x = [3, 1, 1, 0], maxlen = 4
then we get out:
mask = [[1, 1, 1, 0],
[1, 0, 0, 0],
[1, 0, 0, 0],
[0, 0, 0, 0]]
Args:
x (Variable): Input tensor of sequence_mask layer, \
whose elements are integers less than :code:`maxlen`. \
Tensor or LoDTensor with shape [d_1, d_2, ..., d_n].
maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \
is None, it would be replaced with :math:`max(x)`.
dtype (np.dtype|core.VarDesc.VarType|str, optional): Data type of the output, \
``int64`` by default.
name(str, optional): For detailed information, please refer \
to :ref:`api_guide_Name`. Usually name is no need to set and \
None by default.
Returns: The output sequence mask. Tensor or LoDTensor with shape [d_1, d_2, ..., d_n, maxlen] \
and data type of :code:`dtype`. The data type should be float32, float64, int8, \
int32 or int64.
Return Type: Variable
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
x = fluid.data(name='x', shape=[10], dtype='float32', lod_level=1)
mask = layers.sequence_mask(x=x)
"""
helper = LayerHelper('sequence_mask', **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=dtype)
else:
out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
inputs = {'X': [x]}
attrs = {'out_dtype': out.dtype}
if maxlen is not None:
if isinstance(maxlen, Variable):
inputs['MaxLenTensor'] = maxlen
else:
attrs['maxlen'] = maxlen
helper.append_op(
type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs)
out.stop_gradient = True
return out
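# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_mask on a plain 1-D
# length tensor (no LoD is needed here), assuming a fresh default program; the name
# `mask_len` is made up for this sketch.
import paddle.fluid as fluid
import numpy as np

mask_len = fluid.data(name='mask_len', shape=[None], dtype='int64')
mask = fluid.layers.sequence_mask(x=mask_len, maxlen=4, dtype='int64')

place = fluid.CPUPlace()
exe = fluid.Executor(place)
result = exe.run(fluid.default_main_program(),
                 feed={'mask_len': np.array([3, 1, 1, 0]).astype('int64')},
                 fetch_list=[mask])
print(result[0])
# expected:
# [[1 1 1 0]
#  [1 0 0 0]
#  [1 0 0 0]
#  [0 0 0 0]]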
@templatedoc()
def sequence_reverse(x, name=None):
"""
**Note: This OP only accepts LoDTensor as input. If your input is a Tensor, please use the reverse OP (fluid.layers.** :ref:`api_fluid_layers_reverse` ).
This operator only supports LoDTensor as input. It will reverse each sequence for input LoDTensor.
Currently it only supports 1-level LoDTensor. This operator is very useful when building a
reverse :ref:`api_fluid_layers_DynamicRNN` network.
.. code-block:: text
input(x) is a LoDTensor:
x.lod = [[0, 2, 5]]
x.data = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12],
[13,14, 15, 16],
[17,18, 19, 20]]
x.shape = [5, 4]
output LoDTensor with same shape and LoD info:
out.lod = [[0, 2, 5]]
out.data = [[5, 6, 7, 8],
[1, 2, 3, 4],
[17,18, 19, 20],
[13,14, 15, 16],
[9, 10, 11, 12]]
out.shape = [5, 4]
Args:
x(Variable): LoDTensor with 1-level LoD info. Currently it only supports 1-level LoDTensor.
The data type should be float32, float64, int8, int32 or int64.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: LoDTensor reversed from input. The data type is same with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1)
x_reversed = fluid.layers.sequence_reverse(x)
"""
assert not in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_reverse", **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="sequence_reverse",
inputs={"X": x},
outputs={"Y": out},
attrs=dict())
return out
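# NOTE (editorial sketch): the snippet below is illustrative only and is not part of
# the original module. It is a minimal, hedged run of sequence_reverse reproducing
# the text case above, assuming a fresh default program; the name `rev_x` is made up
# for this sketch.
import paddle.fluid as fluid
import numpy as np

rev_x = fluid.data(name='rev_x', shape=[None, 4], dtype='float32', lod_level=1)
rev_out = fluid.layers.sequence_reverse(rev_x)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
np_data = np.arange(1, 21).reshape([5, 4]).astype('float32')
# length-based lod [[2, 3]] corresponds to the offset-based lod [[0, 2, 5]]
x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 3]], place)
result = exe.run(fluid.default_main_program(),
                 feed={'rev_x': x_lod_tensor},
                 fetch_list=[rev_out],
                 return_numpy=False)
print(np.array(result[0]))  # rows within each sequence are reversed, as in the case above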