From f12434620404fed71daf2738cf1de09fcf03ad6d Mon Sep 17 00:00:00 2001
From: guosheng
Date: Sat, 9 May 2020 14:20:21 +0800
Subject: [PATCH] Add api docs for RNN related apis.

---
 hapi/text/text.py | 3651 ++++++++++++++++++++++++++++-----------------
 1 file changed, 2283 insertions(+), 1368 deletions(-)

diff --git a/hapi/text/text.py b/hapi/text/text.py
index b5a0cf5..5327bbd 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -45,17 +45,40 @@ from paddle.fluid.dygraph import Layer
 from paddle.fluid.layers import BeamSearchDecoder

 __all__ = [
-    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
-    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
-    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
-    'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder',
-    'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder',
-    'StackedLSTMCell', 'LSTM', 'BidirectionalLSTM', 'StackedGRUCell', 'GRU',
-    'BidirectionalGRU'
+    'RNNCell',
+    'BasicLSTMCell',
+    'BasicGRUCell',
+    'RNN',
+    'StackedLSTMCell',
+    'LSTM',
+    'BidirectionalLSTM',
+    'StackedGRUCell',
+    'GRU',
+    'BidirectionalGRU',
+    'DynamicDecode',
+    'BeamSearchDecoder',
+    'MultiHeadAttention',
+    'FFN',
+    'TransformerEncoderLayer',
+    'TransformerEncoder',
+    'TransformerDecoderLayer',
+    'TransformerDecoder',
+    'TransformerCell',
+    'TransformerBeamSearchDecoder',
+    'LinearChainCRF',
+    'CRFDecoding',
+    'SequenceTagging',
+    'GRUEncoder',
 ]


 class RNNCell(Layer):
+    """
+    RNNCell is the base class abstracting the calculation that maps the input
+    and state to the output and new state. It is suitable for and mostly used
+    in RNN.
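+
+    A sketch of a custom cell is shown below (an illustration only, assuming
+    nothing beyond the contract described here: a `forward` method taking
+    `(inputs, states)` and returning `(outputs, new_states)`, plus a
+    `state_shape` property used by `get_initial_states`):
+
+    .. code-block:: python
+
+        import paddle.fluid.layers as layers
+        from paddle.incubate.hapi.text import RNNCell
+
+        class AccumulateCell(RNNCell):
+            # A toy cell that echoes its input and keeps a running sum as state.
+            def __init__(self, hidden_size):
+                super(AccumulateCell, self).__init__()
+                self.hidden_size = hidden_size
+
+            def forward(self, inputs, states):
+                # inputs: [batch_size, hidden_size]; states: same shape
+                new_states = layers.elementwise_add(states, inputs)
+                return inputs, new_states
+
+            @property
+            def state_shape(self):
+                # a single state shaped [hidden_size] (batch dim added automatically)
+                return [self.hidden_size]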
+    """
+
     def get_initial_states(self,
                            batch_ref,
                            shape=None,
@@ -70,16 +93,18 @@ class RNNCell(Layer):
             batch_ref: A (possibly nested structure of) tensor variable[s].
                 The first dimension of the tensor will be used as batch size to
                 initialize states.
-            shape: A (possiblely nested structure of) shape[s], where a shape is
+            shape: A (possibly nested structure of) shape[s], where a shape is
-                represented as a list/tuple of integer). -1(for batch size) will
-                beautomatically inserted if shape is not started with it. If None,
+                represented as a list/tuple of integers. -1 (for batch size) will
+                be automatically inserted if shape does not start with it. If None,
                 property `state_shape` will be used. The default value is None.
-            dtype: A (possiblely nested structure of) data type[s]. The structure
+            dtype: A (possibly nested structure of) data type[s]. The structure
-                must be same as that of `shape`, except when all tensors' in states
-                has the same data type, a single data type can be used. If None and
+                must be same as that of `shape`, except when all tensors in states
+                have the same data type, a single data type can be used. If None and
                 property `cell.state_shape` is not available, float32 will be used
                 as the data type. The default value is None.
             init_value: A float value used to initialize states.
+            batch_dim_idx: An integer indicating which dimension of the tensor in
+                inputs represents batch size. The default value is 0.

         Returns:
             Variable: tensor variable[s] packed in the same structure provided \
@@ -170,46 +195,61 @@ class RNNCell(Layer):

 class BasicLSTMCell(RNNCell):
     """
-    ****
-    BasicLSTMUnit class, Using basic operator to build LSTM
-    The algorithm can be described as the code below.
-        .. math::
-            i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-            f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-            o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-            \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-            c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-            h_t &= o_t \odot tanh(c_t)
-
-        $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-        of weights from the input gate to the input)
-        The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-        sigmoid is the logistic sigmoid function.
-        $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-        and cell activation vectors, respectively, all of which have the same size as
-        the cell output activation vector $h$.
-        The :math:`\odot` is the element-wise product of the vectors.
-        :math:`tanh` is the activation functions.
-        :math:`\\tilde{c_t}` is also called candidate hidden state,
-        which is computed based on the current input and the previous hidden state.
-    Args:
-        name_scope(string) : The name scope used to identify parameter and bias name
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-            Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-            Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate
-        dtype(string): data type used in this unit
+    Long-Short Term Memory (LSTM) RNN cell.
+
+    The formula used is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf>`_
+    for more details.
+
+    Parameters:
+        input_size (int): The input size in the LSTM cell.
+        hidden_size (int): The hidden size in the LSTM cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of LSTM. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias(float, optional): forget bias used when computing forget gate.
+            Default 1.0.
+        dtype(string, optional): The data type used in this cell. Default float32.
+        forget_gate_weights (dict, optional): A dict includes `w`, `h` and `b`
+            as keys, and the corresponding values should be instances of Parameter
+            which represent :math:`W_{x_{f}}, W_{h_{f}}, b_{f}` and have shape
+            [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size]
+            respectively. It is used for reusing and sharing weights when provided,
+            otherwise these parameters will be created. Note that parameters from
+            input gate, forget gate and cell would be concatenated in implementation.
+        input_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
+            and the corresponding values should be instances of Parameter which
+            represent :math:`W_{x_{i}}, W_{h_{i}}, b_{i}` respectively. It has the
+            same usage as :attr:`forget_gate_weights`.
+        output_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
+            and the corresponding values should be instances of Parameter which
+            represent :math:`W_{x_{o}}, W_{h_{o}}, b_{o}` respectively. It has the
+            same usage as :attr:`forget_gate_weights`.
+        cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
+            and the corresponding values should be instances of Parameter which
+            represent :math:`W_{x_{c}}, W_{h_{c}}, b_{c}` respectively. It has the
+            same usage as :attr:`forget_gate_weights`.
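+
+    Examples:
+
+        A minimal single-step sketch (an illustration assuming dygraph mode and
+        that, per `state_shape`, the state is the pair :math:`[h_{t-1}, c_{t-1}]`):
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import BasicLSTMCell
+
+            step_input = paddle.rand((2, 32))  # [batch_size, input_size]
+            cell = BasicLSTMCell(input_size=32, hidden_size=64)
+            # zero-initialized [h, c] states, batch size taken from `step_input`
+            init_states = cell.get_initial_states(step_input)
+            output, new_states = cell(step_input, init_states)  # output: [2, 64]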
+    """

     def __init__(self,
@@ -480,41 +520,63 @@ class BasicLSTMCell(RNNCell):

     @property
     def state_shape(self):
+        """
+        The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]`
+        (-1 for batch size would be automatically inserted into shape). These two
+        shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` respectively.
+        """
         return [[self._hidden_size], [self._hidden_size]]


 class BasicGRUCell(RNNCell):
     """
-    ****
-    BasicGRUUnit class, using basic operators to build GRU
-    The algorithm can be described as the equations below.
-
-        .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-            Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-            Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
+    Gated Recurrent Unit (GRU) RNN cell.
+
+    The formula for GRU used is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf>`_
+    for more details.
+
+    Parameters:
+        input_size (int): The input size in the GRU cell.
+        hidden_size (int): The hidden size in the GRU cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of GRU. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        dtype(string, optional): The data type used in this cell. Default float32.
+        update_gate_weights (dict, optional): A dict includes `w`, `h` and `b`
+            as keys, and the corresponding values should be instances of Parameter
+            which represent :math:`W_{ux}, W_{uh}, b_{u}` and have shape
+            [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size]
+            respectively. It is used for reusing and sharing weights when provided,
+            otherwise these parameters will be created. Note that parameters from
+            update gate and reset gate would be concatenated in implementation.
+        reset_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
+            and the corresponding values should be instances of Parameter which
+            represent :math:`W_{rx}, W_{rh}, b_{r}` respectively. It has the
+            same usage as :attr:`update_gate_weights`.
+        cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
+            and the corresponding values should be instances of Parameter which
+            represent :math:`W_{cx}, W_{ch}, b_{c}` respectively. It has the
+            same usage as :attr:`update_gate_weights`.
     """

     def __init__(self,
@@ -678,7 +740,7 @@ class BasicGRUCell(RNNCell):

             if "b" in reset_gate_weights and reset_gate_weights[
                     "b"] is not None:
-                self.rg_b = reused_params["b"]
+                self.rg_b = reset_gate_weights["b"]
             else:
                 if gate_bias_attr is not None and gate_bias_attr.name is not None:
                     tmp_param_attr = copy.deepcopy(gate_bias_attr)
@@ -771,10 +833,44 @@ class BasicGRUCell(RNNCell):

     @property
     def state_shape(self):
+        """
+        The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch
+        size would be automatically inserted into shape). The shape corresponds
+        to :math:`h_{t-1}`.
+        """
         return [self._hidden_size]


-class RNN(fluid.dygraph.Layer):
+class RNN(Layer):
+    """
+    RNN creates a recurrent neural network specified by RNNCell `cell`, which
+    performs :code:`cell.forward()` repeatedly until it reaches the maximum
+    length of `inputs`.
+
+    Parameters:
+        cell(RNNCell): An instance of `RNNCell`.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`. If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN

+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
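+
+        A further sketch (same assumed setup) that masks paddings by passing
+        `sequence_length`, so each sequence stops updating past its valid length:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            # valid lengths of the two sequences in the batch
+            seq_len = fluid.dygraph.to_variable(np.array([3, 4], dtype="int64"))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, final_states = rnn(inputs, sequence_length=seq_len)
+    """

     def __init__(self, cell, is_reverse=False, time_major=False):
         super(RNN, self).__init__()
         self.cell = cell
@@ -790,6 +886,38 @@ class RNN(fluid.dygraph.Layer):
                 initial_states=None,
                 sequence_length=None,
                 **kwargs):
+        """
+        Performs :code:`cell.forward()` repeatedly until it reaches the maximum
+        length of `inputs`.
+
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s].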
+ The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ if fluid.in_dygraph_mode(): class ArrayWrapper(object): @@ -878,1474 +1006,2261 @@ class RNN(fluid.dygraph.Layer): return final_outputs, final_states -class DynamicDecode(Layer): - def __init__(self, - decoder, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False): - super(DynamicDecode, self).__init__() - self.decoder = decoder - self.max_step_num = max_step_num - self.output_time_major = output_time_major - self.impute_finished = impute_finished - self.is_test = is_test - self.return_length = return_length +class StackedRNNCell(RNNCell): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. - def forward(self, inits=None, **kwargs): - if fluid.in_dygraph_mode(): + Parameters: + cells (list|tuple): List of RNN cell instances. - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] + Examples: - def append(self, x): - self.array.append(x) - return self + .. code-block:: python - def __getitem__(self, item): - return self.array.__getitem__(item) + from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - state_dtype = state.dtype - if convert_dtype(state_dtype) in ["bool"]: - state = layers.cast(state, dtype="float32") - new_state = layers.cast(new_state, dtype="float32") - if step_mask.dtype != state.dtype: - step_mask = layers.cast(step_mask, dtype=state.dtype) - # otherwise, renamed bool gradients of would be summed up leading - # to sum(bool) error. 
-                    step_mask.stop_gradient = True
-                    new_state = layers.elementwise_mul(
-                        state, step_mask, axis=0) - layers.elementwise_mul(
-                            new_state, (step_mask - 1), axis=0)
-                    if convert_dtype(state_dtype) in ["bool"]:
-                        new_state = layers.cast(new_state, dtype=state_dtype)
-                    return new_state
-
-            initial_inputs, initial_states, initial_finished = self.decoder.initialize(
-                inits)
-            inputs, states, finished = (initial_inputs, initial_states,
-                                        initial_finished)
-            cond = layers.logical_not((layers.reduce_all(initial_finished)))
-            sequence_lengths = layers.cast(
-                layers.zeros_like(initial_finished), "int64")
-            outputs = None
-
-            step_idx = 0
-            step_idx_tensor = layers.fill_constant(
-                shape=[1], dtype="int64", value=step_idx)
-            while cond.numpy():
-                (step_outputs, next_states, next_inputs,
-                 next_finished) = self.decoder.step(step_idx_tensor, inputs,
-                                                    states, **kwargs)
-                if not self.decoder.tracks_own_finished:
-                    # BeamSearchDecoder would track it own finished, since
-                    # beams would be reordered and the finished status of each
-                    # entry might change. Otherwise, perform logical OR which
-                    # would not change the already finished.
-                    next_finished = layers.logical_or(next_finished, finished)
-                    # To confirm states.finished/finished be consistent with
-                    # next_finished.
-                    layers.assign(next_finished, finished)
-                next_sequence_lengths = layers.elementwise_add(
-                    sequence_lengths,
-                    layers.cast(
-                        layers.logical_not(finished), sequence_lengths.dtype))
-
-                if self.impute_finished:  # rectify the states for the finished.
-                    next_states = map_structure(
-                        lambda x, y: _maybe_copy(x, y, finished), states,
-                        next_states)
-                outputs = map_structure(
-                    lambda x: ArrayWrapper(x),
-                    step_outputs) if step_idx == 0 else map_structure(
-                        lambda x, x_array: x_array.append(x), step_outputs,
-                        outputs)
-                inputs, states, finished, sequence_lengths = (
-                    next_inputs, next_states, next_finished,
-                    next_sequence_lengths)
-
-                layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
-                step_idx += 1
-
-                layers.logical_not(layers.reduce_all(finished), cond)
-                if self.max_step_num is not None and step_idx > self.max_step_num:
-                    break
-
-            final_outputs = map_structure(
-                lambda x: fluid.layers.stack(x.array, axis=0), outputs)
-            final_states = states
-
-            try:
-                final_outputs, final_states = self.decoder.finalize(
-                    final_outputs, final_states, sequence_lengths)
-            except NotImplementedError:
-                pass
-
-            if not self.output_time_major:
-                final_outputs = map_structure(
-                    lambda x: layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), final_outputs)
-
-            return (final_outputs, final_states,
-                    sequence_lengths) if self.return_length else (
-                        final_outputs, final_states)
-        else:
-            return fluid.layers.dynamic_decode(
-                self.decoder,
-                inits,
-                max_step_num=self.max_step_num,
-                output_time_major=self.output_time_major,
-                impute_finished=self.impute_finished,
-                is_test=self.is_test,
-                return_length=self.return_length,
-                **kwargs)
-
+class StackedRNNCell(RNNCell):
+    """
+    Wrapper allowing a stack of RNN cells to behave as a single cell. It is used
+    to implement stacked RNNs.
+
+    Parameters:
+        cells (list|tuple): List of RNN cell instances.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell
+
+            cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)]
+            stack_rnn = StackedRNNCell(cells)
+    """
+
+    def __init__(self, cells):
+        super(StackedRNNCell, self).__init__()
+        self.cells = []
+        for i, cell in enumerate(cells):
+            self.cells.append(self.add_sublayer("cell_%d" % i, cell))
+
+    def forward(self, inputs, states, **kwargs):
+        """
+        Performs :code:`cell.forward` for all including cells sequentially.
+        Each cell's `inputs` is the `outputs` of the previous cell. And each
+        cell's `states` is the corresponding one in `states`.
+
+        Parameters:
+            inputs (Variable): The inputs for the first cell. Mostly it is a
+                float32 or float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \
+                `outputs` of the last cell. `new_states` is a list composed \
+                of all cells' `new_states`, and its structure and data type is \
+                same as that of `states` argument.
+        """
+        new_states = []
+        for cell, state in zip(self.cells, states):
+            outputs, new_state = cell(inputs, state, **kwargs)
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+
+    @staticmethod
+    def stack_param_attr(param_attr, n):
+        """
+        If `param_attr` is a list or tuple, convert every element in it to a
+        ParamAttr instance.
+        Otherwise, repeat `param_attr` `n` times to
+        construct a list, and rename every one by appending an increasing index
+        suffix to avoid having same names when `param_attr` contains a name.
+
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+                converted to a ParamAttr instance by `ParamAttr._to_attr`.
+            n (int): The times to repeat to construct a list when `param_attr`
+                is not a list or tuple.
+
+        Returns:
+            list: A list composed of each including cell's `param_attr`.
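+
+        Examples:
+
+            A sketch of the renaming behavior (assuming a named `fluid.ParamAttr`;
+            unnamed attributes are simply repeated):
+
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                from paddle.incubate.hapi.text import StackedRNNCell
+
+                attrs = StackedRNNCell.stack_param_attr(fluid.ParamAttr(name="w"), 2)
+                print([attr.name for attr in attrs])  # ['w_0', 'w_1']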
+        """
+        if isinstance(param_attr, (list, tuple)):
+            assert len(param_attr) == n, (
+                "length of param_attr should be %d when it is a list/tuple" %
+                n)
+            param_attrs = [
+                fluid.ParamAttr._to_attr(attr) for attr in param_attr
+            ]
+        else:
+            param_attrs = []
+            attr = fluid.ParamAttr._to_attr(param_attr)
+            for i in range(n):
+                attr_i = copy.deepcopy(attr)
+                if attr.name:
+                    attr_i.name = attr_i.name + "_" + str(i)
+                param_attrs.append(attr_i)
+        return param_attrs
+
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedRNNCell is a list composed of each including
+        cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each including cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+
+
+class StackedLSTMCell(RNNCell):
+    """
+    Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used
+    to implement stacked LSTM.
+
+    The formula for LSTM used here is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+
+    Parameters:
+        input_size (int): The input size for the first LSTM cell.
+        hidden_size (int): The hidden size for every LSTM cell.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias (float, optional): forget bias used when computing forget
+            gate. It also can accept a boolean value `True`, which would set
+            :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{c}, b_{o}` as 0. This is recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedLSTMCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+
+        self.cells = []
+        for i in range(num_layers):
+            if forget_bias is True:
+                bias_attrs[
+                    i].initializer = fluid.initializer.NumpyArrayInitializer(
+                        np.concatenate([
+                            np.zeros(2 * hidden_size),
+                            np.ones(hidden_size), np.zeros(hidden_size)
+                        ]).astype(dtype))
+                forget_bias = 0.0
+            self.cells.append(
+                self.add_sublayer(
+                    "lstm_%d" % i,
+                    BasicLSTMCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        forget_bias=forget_bias,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))
+
+    def forward(self, inputs, states):
+        """
+        Performs the stacked LSTM cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+ + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else outputs + inputs = outputs + new_states.append(new_state) return outputs, new_states @property def state_shape(self): - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(len(self.decoder.n_layer))] + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished 
= (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) - -### Transformer Modules ### -class PrePostProcessLayer(Layer): +class LSTM(Layer): """ - PrePostProcessLayer + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] """ def __init__(self, - process_cmd, - d_model, - dropout_rate, - reused_layer_norm=None): - super(PrePostProcessLayer, self).__init__() - self.process_cmd = process_cmd - self.functors = [] - for cmd in self.process_cmd: - if cmd == "a": # add residual connection - self.functors.append( - lambda x, y: x + y if y is not None else x) - elif cmd == "n": # add layer normalization - if reused_layer_norm is not None: - layer_norm = reused_layer_norm - else: - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, is_reverse, time_major) - self.functors.append( - self.add_sublayer( - "layer_norm_%d" % len( - self.sublayers(include_sublayers=False)), - layer_norm)) - elif cmd == "d": # add dropout - self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. - def forward(self, x, residual=None): - for i, cmd in enumerate(self.process_cmd): - if cmd == "a": - x = self.functors[i](x, residual) - else: - x = self.functors[i](x) - return x + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. + """ + return self.lstm(inputs, initial_states, sequence_length) -class MultiHeadAttention(Layer): +class BidirectionalRNN(Layer): """ - Multi-Head Attention + Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. 
+
+    Parameters:
+        cell_fw (RNNCell): A RNNCell instance used for forward RNN.
+        cell_bw (RNNCell): A RNNCell instance used for backward RNN.
+        merge_mode (str|None, optional): The way to merge outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None returns the two `outputs` as a tuple, and `zip` makes a
+            tuple out of each pair of corresponding tensors in the two `outputs`.
+            Default `concat`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell_fw = StackedLSTMCell(32, 64)
+            cell_bw = StackedLSTMCell(32, 64)
+            bi_rnn = BidirectionalRNN(cell_fw, cell_bw)
+            outputs, _ = bi_rnn(inputs)  # [2, 4, 128]
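+
+        A variant sketch (same assumed setup) that sums the two directions'
+        outputs instead of concatenating them, keeping the feature size at
+        `hidden_size`:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell_fw = StackedLSTMCell(32, 64)
+            cell_bw = StackedLSTMCell(32, 64)
+            # elementwise sum of forward and backward outputs
+            bi_rnn = BidirectionalRNN(cell_fw, cell_bw, merge_mode='sum')
+            outputs, _ = bi_rnn(inputs)  # [2, 4, 64]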
+    """
+
+    def __init__(self,
+                 cell_fw,
+                 cell_bw,
+                 merge_mode='concat',
+                 time_major=False,
+                 cell_cls=None,
+                 **kwargs):
+        super(BidirectionalRNN, self).__init__()
+        self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major)
+        self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major)
+        if merge_mode == 'concat':
+            self.merge_func = lambda x, y: layers.concat([x, y], -1)
+        elif merge_mode == 'sum':
+            self.merge_func = lambda x, y: layers.elementwise_add(x, y)
+        elif merge_mode == 'ave':
+            self.merge_func = lambda x, y: layers.scale(
+                layers.elementwise_add(x, y), 0.5)
+        elif merge_mode == 'mul':
+            self.merge_func = lambda x, y: layers.elementwise_mul(x, y)
+        elif merge_mode == 'zip':
+            self.merge_func = lambda x, y: (x, y)
+        elif merge_mode is None:
+            self.merge_func = None
+        else:
+            raise ValueError('Unsupported value for `merge_mode`: %s' %
+                             merge_mode)
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        """
+        Performs forward and backward RNN separately, and merges the outputs of
+        these two RNN according to `merge_mode`.
+
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s].
+                The shape of tensor should be `[batch_size, sequence_length, ...]`
+                for `time_major == False` or `[sequence_length, batch_size, ...]`
+                for `time_major == True`. It represents the inputs to be unrolled
+                in both forward and backward RNN.
+            initial_states (Variable|list|tuple): If it is a list or tuple, its
+                length should be 2 to include initial states of forward and backward
+                RNN separately. Otherwise it would be used twice for the two RNN.
+                If None, `cell.get_initial_states` would be used to produce the initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+            **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is produced by merging the outputs of forward and backward RNN \
+                according to `merge_mode`, and `final_states` is a pair including \
+                `final_states` of forward and backward RNN.
+        """
+        if isinstance(initial_states, (list, tuple)):
+            assert len(
+                initial_states
+            ) == 2, "length of initial_states should be 2 when it is a list/tuple"
+        else:
+            initial_states = [initial_states, initial_states]
+        outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0],
+                                            sequence_length, **kwargs)
+        outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1],
+                                            sequence_length, **kwargs)
+        outputs = map_structure(
+            self.merge_func, outputs_fw,
+            outputs_bw) if self.merge_func else (outputs_fw, outputs_bw)
+        return outputs, (states_fw, states_bw)
+
+    @staticmethod
+    def bidirect_param_attr(param_attr):
+        """
+        Converts `param_attr` to a pair of `param_attr` when it is not a list
+        or tuple with length 2, and renames each one by appending a suffix to
+        avoid having same names when `param_attr` contains a name.
+
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+                converted to a ParamAttr instance by `ParamAttr._to_attr`. When
+                it is a list or tuple, its length must be 2.
- # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) - if attn_bias is not None: - product += attn_bias - weights = layers.softmax(product) - if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) - - out = layers.matmul(weights, v) - - # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.proj_fc(out) - return out - - def cal_kv(self, keys, values): - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs -class FFN(Layer): +class BidirectionalLSTM(Layer): """ - Feed-Forward Network + Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an + input sequence. + + Bidirection interaction can happen after each layer or only after the last + layer according to the `merge_each_layer` setting. The way to interact, + that is how to merge outputs of the two direction, is determined by `merge_mode`. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. 
It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + merge_each_layer (bool, optional): Indicate whether bidirection interaction + happens after each layer or only after the last layer. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BidirectionalLSTM + + inputs = paddle.rand((2, 4, 32)) + bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = bi_lstm(inputs) # [2, 4, 128] """ def __init__(self, - d_inner_hid, - d_model, - dropout_rate, - fc1_act="relu", - reused_fc1=None, - reused_fc2=None): - super(FFN, self).__init__() - self.dropout_rate = dropout_rate - if reused_fc1 is not None: - self.fc1 = reused_fc1 - else: - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - if reused_fc2 is not None: - self.fc2 = reused_fc2 + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + merge_mode='concat', + merge_each_layer=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(BidirectionalLSTM, self).__init__() + self.num_layers = num_layers + self.merge_mode = merge_mode + self.merge_each_layer = merge_each_layer + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[0], bias_attrs[0], + dtype) + cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[1], bias_attrs[1], + dtype) + self.lstm = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) else: - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, x): - hidden = self.fc1(x) - if self.dropout_rate: - hidden = 
layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) - out = self.fc2(hidden) - return out + # maybe design cell including both forward and backward later + self.lstm = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + fw_param_attrs[i], fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + bw_param_attrs[i], bw_bias_attrs[i], dtype) + self.lstm.append( + self.add_sublayer( + "lstm_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one, or when `merge_each_layer` is True, + merged outputs would be the `inputs` of the subsequent one. -class TransformerEncoderLayer(Layer): + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM. If `merge_each_layer` is True, the length of + list should be `num_layers` and a single value would be reused for + `num_layers`; Otherwise, the length should be 2 and a single value + would be reused twice. If not provided, use 0 as initial states. + Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last bidirectional LSTM; `final_states` is a \ + pair including `final_states` of forward and backward LSTM when \ + `merge_each_layer` is False or a list including `final_states` \ + of all stacked bidirectional LSTM, and it has tensors with same \ + shapes data types as `initial_states`. + """ + if not self.merge_each_layer: + return self.lstm(inputs, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list/tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + outputs, states = self.lstm[i](inputs, initial_states[i], + sequence_length) + inputs = outputs + stacked_states.append(states) + return outputs, stacked_states + + +class StackedGRUCell(RNNCell): """ - EncoderLayer + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. + + The formula for GRU used here is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. 
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedGRUCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedGRUCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedGRUCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+
+        self.cells = []
+        for i in range(num_layers):
+            self.cells.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    BasicGRUCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))
+
+    def forward(self, inputs, states):
+        """
+        Performs the stacked GRU cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+
+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last GRU; `new_states` \
+                is a list composed of every GRU `new_states` which is also \
+                :math:`h_{t}` in the formula, and the data type and structure \
+                of these tensors all is same as that of `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
+            outputs = layers.dropout(
+                outputs,
+                self.dropout[i],
+                dropout_implementation='upscale_in_train') if self.dropout[
+                    i] > 0 else outputs
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedGRUCell is a list composed of each including
+        GRU cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each including GRU cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+
+
 class TransformerEncoderLayer(Layer):
     """
     EncoderLayer
     """

     def __init__(self,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  ffn_fc1_act="relu",
                  reused_pre_selatt_layernorm=None,
                  reused_multihead_att_weights={
                      "reused_query_fc": None,
                      "reused_key_fc": None,
                      "reused_value_fc": None,
                      "reused_proj_fc": None
                  },
                  reused_post_selfatt_layernorm=None,
                  reused_pre_ffn_layernorm=None,
                  reused_ffn_weights={"reused_fc1": None,
                                      "reused_fc2": None},
                  reused_post_ffn_layernorm=None):

         super(TransformerEncoderLayer, self).__init__()

         self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                  prepostprocess_dropout,
                                                  reused_pre_selatt_layernorm)
         self.self_attn = MultiHeadAttention(
             d_key,
             d_value,
             d_model,
             n_head,
             attention_dropout,
             reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
             reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
             reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
     """

     def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da",
-                 ffn_fc1_act="relu",
-                 reused_pre_selatt_layernorm=None,
-                 reused_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_selfatt_layernorm=None,
-                 reused_pre_ffn_layernorm=None,
-                 reused_ffn_weights={"reused_fc1": None,
-                                     "reused_fc2": None},
-                 reused_post_ffn_layernorm=None):
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedGRUCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)

-        super(TransformerEncoderLayer, self).__init__()
+        self.cells = []
+        for i in range(num_layers):
+            self.cells.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    BasicGRUCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))

-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_selatt_layernorm)
-        self.self_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
-            reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
-            reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
-        self.postprocesser1 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_selfatt_layernorm)

+    def forward(self, inputs, states):
+        """
+        Performs the stacked GRU cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell, and each cell's `states` is the
+        corresponding one in `states`.

-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_ffn_layernorm)
-        self.ffn = FFN(d_inner_hid,
-                       d_model,
-                       relu_dropout,
-                       fc1_act=ffn_fc1_act,
-                       reused_fc1=reused_ffn_weights["reused_fc1"],
-                       reused_fc2=reused_ffn_weights["reused_fc2"])
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout,
-                                                  reused_post_ffn_layernorm)

+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.

-    def forward(self, enc_input, attn_bias):
-        attn_output = self.self_attn(
-            self.preprocesser1(enc_input), None, None, attn_bias)
-        attn_output = self.postprocesser1(attn_output, enc_input)

+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last GRU; `new_states` \
+                is a list composed of each GRU's `new_states`, which is also \
+                :math:`h_{t}` in the formula, and these tensors have the same \
+                data types and structure as those in `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
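+            # `upscale_in_train` scales the kept activations by
+            # 1/(1 - dropout_prob) at training time, so the output needs no
+            # extra rescaling at inference time; dropout is skipped entirely
+            # for layers whose configured probability is 0.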
+            outputs = layers.dropout(
+                outputs,
+                self.dropout[i],
+                dropout_implementation='upscale_in_train') if self.dropout[
+                    i] > 0 else outputs
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states

-        ffn_output = self.ffn(self.preprocesser2(attn_output))
-        ffn_output = self.postprocesser2(ffn_output, attn_output)
-        return ffn_output

+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedGRUCell is a list composed of each contained
+        GRU cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each contained GRU cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]

-class TransformerEncoder(Layer):
+
+class GRU(Layer):
     """
-    encoder
+    Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+
+    The formula for GRU used here is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+
+    Parameters:
+        input_size (int): The input size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU cells to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`. If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something that can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something that can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import GRU
+
+            inputs = paddle.rand((2, 4, 32))
+            gru = GRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = gru(inputs)  # [2, 4, 64]
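+
+        A sketch of the same stack run in time-major layout; only the
+        `time_major` flag and the input layout differ from the example above:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import GRU
+
+            # [sequence_length, batch_size, input_size]
+            inputs = paddle.rand((4, 2, 32))
+            gru = GRU(input_size=32, hidden_size=64, num_layers=2,
+                      time_major=True)
+            outputs, _ = gru(inputs)  # [4, 2, 64]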
     """

     def __init__(self,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da",
-                 ffn_fc1_act="relu"):
-
-        super(TransformerEncoder, self).__init__()
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 is_reverse=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(GRU, self).__init__()
+        gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                  activation, num_layers, dropout, param_attr,
+                                  bias_attr, dtype)
+        self.gru = RNN(gru_cell, is_reverse, time_major)

-        self.encoder_layers = list()
-        for i in range(n_layer):
-            self.encoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerEncoderLayer(
-                        n_head,
-                        d_key,
-                        d_value,
-                        d_model,
-                        d_inner_hid,
-                        prepostprocess_dropout,
-                        attention_dropout,
-                        relu_dropout,
-                        preprocess_cmd,
-                        postprocess_cmd,
-                        ffn_fc1_act=ffn_fc1_act)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)

+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs`
+        is the `inputs` of the subsequent one.

-    def forward(self, enc_input, attn_bias):
-        for encoder_layer in self.encoder_layers:
-            enc_output = encoder_layer(enc_input, attn_bias)
-            enc_input = enc_output

+        Parameters:
+            inputs (Variable): The inputs for the first GRU. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states
+                of all stacked GRU, and the initial state of each GRU is a tensor
+                shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.

-        return self.processer(enc_output)

+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of the last GRU and is a tensor with shape \
+                `[batch_size, sequence_length, hidden_size]` with the same \
+                data type as `inputs`; `final_states` is the counterpart of \
+                `initial_states` at the last time step, thus has the same \
+                structure with it, and its tensors have the same shapes and \
+                data types.
+        """
+        return self.gru(inputs, initial_states, sequence_length)

-class TransformerDecoderLayer(Layer):
+class BidirectionalGRU(Layer):
     """
-    decoder
+    Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+
+    Bidirectional interaction can happen after each layer or only after the last
+    layer according to the `merge_each_layer` setting. The way to interact,
+    that is how to merge the outputs of the two directions, is determined by
+    `merge_mode`.
+
+    The formula for GRU used here is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+
+    Parameters:
+        input_size (int): The input size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU cells to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        merge_mode (str|None, optional): The way to merge the outputs of forward
+            and backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` or
+            None, where None makes the two `outputs` a tuple, and `zip` makes
+            each pair of corresponding tensors from the two `outputs` a tuple.
+            Default `concat`.
+        merge_each_layer (bool, optional): Indicate whether bidirectional interaction
+            happens after each layer or only after the last layer. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`. If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something that can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something that can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BidirectionalGRU
+
+            inputs = paddle.rand((2, 4, 32))
+            bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = bi_gru(inputs)  # [2, 4, 128]
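+
+        A sketch with the two directions summed instead of concatenated; only
+        `merge_mode` differs from the example above:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import BidirectionalGRU
+
+            inputs = paddle.rand((2, 4, 32))
+            bi_gru = BidirectionalGRU(input_size=32, hidden_size=64,
+                                      merge_mode='sum')
+            outputs, _ = bi_gru(inputs)  # [2, 4, 64], forward + backward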
     """

     def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da",
-                 reused_pre_selfatt_layernorm=None,
-                 reused_self_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_selfatt_layernorm=None,
-                 reused_pre_crossatt_layernorm=None,
-                 reused_cross_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_crossatt_layernorm=None,
-                 reused_pre_ffn_layernorm=None,
-                 reused_ffn_weights={"reused_fc1": None,
-                                     "reused_fc2": None},
-                 reused_post_ffn_layernorm=None):
-        super(TransformerDecoderLayer, self).__init__()
-
-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_selfatt_layernorm)
-        self.self_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_self_multihead_att_weights[
-                "reused_query_fc"],
-            reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_self_multihead_att_weights[
-                "reused_value_fc"],
-            reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
-        self.postprocesser1 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_selfatt_layernorm)
-
-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_crossatt_layernorm)
-        self.cross_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_cross_multihead_att_weights[
-                "reused_query_fc"],
-            reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_cross_multihead_att_weights[
-                "reused_value_fc"],
-            reused_proj_fc=reused_cross_multihead_att_weights[
-                "reused_proj_fc"])
-        self.postprocesser2 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_crossatt_layernorm)
-
-        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_ffn_layernorm)
-        self.ffn = FFN(d_inner_hid,
-                       d_model,
-                       relu_dropout,
-                       reused_fc1=reused_ffn_weights["reused_fc1"],
-                       reused_fc2=reused_ffn_weights["reused_fc2"])
-        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout,
-                                                  reused_post_ffn_layernorm)
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 merge_mode='concat',
+                 merge_each_layer=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(BidirectionalGRU, self).__init__()
+        self.num_layers = num_layers
+        self.merge_mode = merge_mode
+        self.merge_each_layer = merge_each_layer
+        param_attrs = 
BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[0], bias_attrs[0], dtype) + cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[1], bias_attrs[1], dtype) + self.gru = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) + else: + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - cache=None): - self_attn_output = self.self_attn( - self.preprocesser1(dec_input), None, None, self_attn_bias, cache) - self_attn_output = self.postprocesser1(self_attn_output, dec_input) + # maybe design cell including both forward and backward later + self.gru = [] + for i in range(num_layers): + cell_fw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + self.gru.append( + self.add_sublayer( + "gru_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) - cross_attn_output = self.cross_attn( - self.preprocesser2(self_attn_output), enc_output, enc_output, - cross_attn_bias, cache) - cross_attn_output = self.postprocesser2(cross_attn_output, - self_attn_output) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs` + is the `inputs` of the subsequent one, or when `merge_each_layer` is True, + merged outputs would be the `inputs` of the subsequent one. - ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) - ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + Parameters: + inputs (Variable): The inputs for the first GRU. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked GRU. If `merge_each_layer` is True, the length of + list should be `num_layers` and a single value would be reused for + `num_layers`; Otherwise, the length should be 2 and a single value + would be reused twice. If not provided, use 0 as initial states. + Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. 
-        return ffn_output

+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of the last bidirectional GRU; `final_states` is a \
+                pair including `final_states` of forward and backward GRU when \
+                `merge_each_layer` is False, or a list including `final_states` \
+                of all stacked bidirectional GRU, and its tensors have the same \
+                shapes and data types as `initial_states`.
+        """
+        if not self.merge_each_layer:
+            return self.gru(inputs, initial_states, sequence_length)
+        else:
+            if isinstance(initial_states, (list, tuple)):
+                assert len(initial_states) == self.num_layers, (
+                    "length of initial_states should be %d when it is a list/tuple"
+                    % self.num_layers)
+            else:
+                initial_states = [initial_states] * self.num_layers
+            stacked_states = []
+            for i in range(self.num_layers):
+                outputs, states = self.gru[i](inputs, initial_states[i],
+                                              sequence_length)
+                inputs = outputs
+                stacked_states.append(states)
+            return outputs, stacked_states

-class TransformerDecoder(Layer):
+class DynamicDecode(Layer):
     """
-    decoder
+    DynamicDecode integrates a Decoder instance to perform dynamic decoding.
+
+    It performs :code:`decoder.step()` repeatedly until the returned Tensor
+    indicating finished status contains all True values or the number of
+    decoding steps reaches :attr:`max_step_num`.
+
+    :code:`decoder.initialize()` would be called once before the decoding loop.
+    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+    would be called once after the decoding loop.
+
+    Parameters:
+        decoder (Decoder): An instance of `Decoder`.
+        max_step_num (int, optional): The maximum number of steps. If not provided,
+            decode until the decoder is fully done, or in other words, the returned
+            Tensor by :code:`decoder.step()` indicating finished status contains
+            all True. Default `None`.
+        output_time_major (bool, optional): Indicate the data layout of Tensor included
+            in the final outputs (the first returned value of this method). If
+            :attr:`False`, the data layout would be batch major with shape
+            `[batch_size, seq_len, ...]`. If :attr:`True`, the data layout would
+            be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
+        impute_finished (bool, optional): If `True`, then states get copied through
+            for batch entries which are marked as finished, while unfinished entries
+            keep using the new states returned by :code:`decoder.step()`; this
+            ensures that the final states have the correct values. Otherwise, states
+            wouldn't be copied through when finished. If the returned `final_states`
+            is needed, it should be set as True, which causes some slowdown.
+            Default `False`.
+        is_test (bool, optional): A flag indicating whether to use test mode. In
+            test mode, it saves more memory. Default `False`.
+        return_length (bool, optional): A flag indicating whether to return an
+            extra Tensor variable in the output tuple, which stores the actual
+            lengths of all decoded sequences. Default `False`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, \
+                BeamSearchDecoder, DynamicDecode
+
+            vocab_size, d_model = 100, 32
+            encoder_output = paddle.rand((2, 4, d_model))
+            trg_embedder = fluid.dygraph.Embedding(size=[vocab_size, d_model])
+            output_layer = fluid.dygraph.Linear(d_model, vocab_size)
+            cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model)
+            decoder = BeamSearchDecoder(cell,
+                                        start_token=0,
+                                        end_token=1,
+                                        beam_size=4,
+                                        embedding_fn=trg_embedder,
+                                        output_fn=output_layer)
+            dynamic_decoder = DynamicDecode(decoder, max_step_num=10)
+            outputs = dynamic_decoder(cell.get_initial_states(encoder_output))
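+
+        A sketch of the same decoder run with the decoded lengths returned; it
+        reuses `decoder`, `cell` and `encoder_output` from the example above,
+        and only the `return_length` flag differs:
+
+        .. code-block:: python
+
+            dynamic_decoder = DynamicDecode(
+                decoder, max_step_num=10, return_length=True)
+            outputs, states, lengths = dynamic_decoder(
+                cell.get_initial_states(encoder_output))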
     """

-    def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
-                 prepostprocess_dropout, attention_dropout, relu_dropout,
-                 preprocess_cmd, postprocess_cmd):
-        super(TransformerDecoder, self).__init__()
-
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.d_key = d_key
-        self.d_value = d_value
+    def __init__(self,
+                 decoder,
+                 max_step_num=None,
+                 output_time_major=False,
+                 impute_finished=False,
+                 is_test=False,
+                 return_length=False):
+        super(DynamicDecode, self).__init__()
+        self.decoder = decoder
+        self.max_step_num = max_step_num
+        self.output_time_major = output_time_major
+        self.impute_finished = impute_finished
+        self.is_test = is_test
+        self.return_length = return_length

-        self.decoder_layers = list()
-        for i in range(n_layer):
-            self.decoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerDecoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
-                        prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)

+    def forward(self, inits=None, **kwargs):
+        """
+        Performs :code:`decoder.step()` repeatedly until the returned Tensor
+        indicating finished status contains all True values or the number of
+        decoding steps reaches :attr:`max_step_num`.

-    def forward(self,
-                dec_input,
-                enc_output,
-                self_attn_bias,
-                cross_attn_bias,
-                caches=None):
-        for i, decoder_layer in enumerate(self.decoder_layers):
-            dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
-                                       cross_attn_bias, None
-                                       if caches is None else caches[i])
-            dec_input = dec_output

+        :code:`decoder.initialize()` would be called once before the decoding loop.
+        If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+        would be called once after the decoding loop.

-        return self.processer(dec_output)

+        Parameters:
+            inits (object, optional): Argument passed to `decoder.initialize`.
+                Default `None`.
+            **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.

-    def prepare_static_cache(self, enc_output):
-        return [
-            dict(
-                zip(("static_k", "static_v"),
-                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
-            for decoder_layer in self.decoder_layers
-        ]

+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \
+                when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \
+                The final outputs and states are both Tensor or nested structures of Tensor. \
+                `final_outputs` has the same structure and data types as the :code:`outputs` \
+                returned by :code:`decoder.step()` , and each Tensor in `final_outputs` \
+                is stacked from the outputs of all decoding steps, which might be revised \
+                by :code:`decoder.finalize()` if the decoder has implemented `finalize`.
\ + `final_states` is the counterpart at last time step of initial states \ + returned by :code:`decoder.initialize()` , thus has the same structure \ + with it and has tensors with same shapes and data types. `sequence_lengths` \ + is an `int64` tensor with the same shape as `finished` returned \ + by :code:`decoder.initialize()` , and it stores the actual lengths of \ + all decoded sequences. + """ + if fluid.in_dygraph_mode(): - def prepare_incremental_cache(self, enc_output): - return [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in range(self.n_layer)] + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + def append(self, x): + self.array.append(x) + return self -#TODO: we should merge GRUCell with BasicGRUCell -class GRUCell(RNNCell): - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False): - super(GRUCell, self).__init__() - self.hidden_size = hidden_size - self.fc_layer = Linear( - input_size, hidden_size * 3, param_attr=param_attr) + def __getitem__(self, item): + return self.array.__getitem__(item) - self.gru_unit = GRUUnit( - hidden_size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + state_dtype = state.dtype + if convert_dtype(state_dtype) in ["bool"]: + state = layers.cast(state, dtype="float32") + new_state = layers.cast(new_state, dtype="float32") + if step_mask.dtype != state.dtype: + step_mask = layers.cast(step_mask, dtype=state.dtype) + # otherwise, renamed bool gradients of would be summed up leading + # to sum(bool) error. + step_mask.stop_gradient = True + new_state = layers.elementwise_mul( + state, step_mask, axis=0) - layers.elementwise_mul( + new_state, (step_mask - 1), axis=0) + if convert_dtype(state_dtype) in ["bool"]: + new_state = layers.cast(new_state, dtype=state_dtype) + return new_state - def forward(self, inputs, states): - # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.fc_layer(inputs) - hidden, _, _ = self.gru_unit(x, states) - return hidden, hidden + initial_inputs, initial_states, initial_finished = self.decoder.initialize( + inits) + inputs, states, finished = (initial_inputs, initial_states, + initial_finished) + cond = layers.logical_not((layers.reduce_all(initial_finished))) + sequence_lengths = layers.cast( + layers.zeros_like(initial_finished), "int64") + outputs = None - @property - def state_shape(self): - return [self.hidden_size] + step_idx = 0 + step_idx_tensor = layers.fill_constant( + shape=[1], dtype="int64", value=step_idx) + while cond.numpy(): + (step_outputs, next_states, next_inputs, + next_finished) = self.decoder.step(step_idx_tensor, inputs, + states, **kwargs) + if not self.decoder.tracks_own_finished: + # BeamSearchDecoder would track it own finished, since + # beams would be reordered and the finished status of each + # entry might change. Otherwise, perform logical OR which + # would not change the already finished. 
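+                    # The logical OR below can therefore only freeze more
+                    # entries: once an entry is marked finished, it can never
+                    # flip back to unfinished in a later step.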
+ next_finished = layers.logical_or(next_finished, finished) + # To confirm states.finished/finished be consistent with + # next_finished. + layers.assign(next_finished, finished) + next_sequence_lengths = layers.elementwise_add( + sequence_lengths, + layers.cast( + layers.logical_not(finished), sequence_lengths.dtype)) + if self.impute_finished: # rectify the states for the finished. + next_states = map_structure( + lambda x, y: _maybe_copy(x, y, finished), states, + next_states) + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if step_idx == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + inputs, states, finished, sequence_lengths = ( + next_inputs, next_states, next_finished, + next_sequence_lengths) -#TODO: we should merge GRUCell with BasicGRUCell -class GRUEncoderCell(RNNCell): - def __init__(self, - num_layers, - input_size, - hidden_size, - dropout_prob=0., - init_scale=0.1): - super(GRUEncoderCell, self).__init__() - self.dropout_prob = dropout_prob - # use add_sublayer to add multi-layers - self.gru_cells = [] - for i in range(num_layers): - self.gru_cells.append( - self.add_sublayer( - "gru_%d" % i, - #BasicGRUCell( - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))))) + layers.increment(x=step_idx_tensor, value=1.0, in_place=True) + step_idx += 1 - def forward(self, step_input, states): - new_states = [] - for i, gru_cell in enumerate(self.gru_cells): - out, state = gru_cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout_prob, - dropout_implementation='upscale_in_train' - ) if self.dropout_prob > 0 else out - new_states.append(step_input) - return step_input, new_states + layers.logical_not(layers.reduce_all(finished), cond) + if self.max_step_num is not None and step_idx > self.max_step_num: + break - @property - def state_shape(self): - return [cell.state_shape for cell in self.gru_cells] + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, axis=0), outputs) + final_states = states + try: + final_outputs, final_states = self.decoder.finalize( + final_outputs, final_states, sequence_lengths) + except NotImplementedError: + pass -class BiGRU(fluid.dygraph.Layer): - def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): - super(BiGRU, self).__init__() - self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=False, - time_major=False) + if not self.output_time_major: + final_outputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), final_outputs) - self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=True, - time_major=False) + return (final_outputs, final_states, + sequence_lengths) if self.return_length else ( + final_outputs, final_states) + else: + return fluid.layers.dynamic_decode( + self.decoder, + inits, + max_step_num=self.max_step_num, + output_time_major=self.output_time_major, + impute_finished=self.impute_finished, + is_test=self.is_test, + return_length=self.return_length, + **kwargs) - def forward(self, input_feature): - pre_gru, pre_state = self.gru(input_feature) - gru_r, r_state = self.gru_r(input_feature) - bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - return bi_merge +class TransformerCell(Layer): + """ + Let inputs=(trg_word, trg_pos), 
states=cache, so that the Transformer can be
+    used as an RNNCell.
+    """

-class LinearChainCRF(Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(LinearChainCRF, self).__init__()
+    def __init__(self, decoder, embedding_fn=None, output_fn=None):
+        super(TransformerCell, self).__init__()
+        self.decoder = decoder
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn

-        self._param_attr = param_attr
-        self._dtype = dtype
-        self._size = size
-        self._is_test = is_test
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
+        trg_word, trg_pos = inputs
+        for cache, static_cache in zip(states, static_caches):
+            cache.update(static_cache)
+        if self.embedding_fn is not None:
+            dec_input = self.embedding_fn(trg_word, trg_pos)
+            outputs = self.decoder(dec_input, enc_output, None,
+                                   trg_src_attn_bias, states)
+        else:
+            outputs = self.decoder(trg_word, trg_pos, enc_output, None,
+                                   trg_src_attn_bias, states)
+        if self.output_fn is not None:
+            outputs = self.output_fn(outputs)
+        if len(outputs.shape) == 3:
+            # squeeze to adapt to BeamSearchDecoder which uses 2D logits
+            outputs = layers.squeeze(outputs, [1])
+        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
+        return outputs, new_states

     @property
-    def weight(self):
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        self._transition = value
+    def state_shape(self):
+        return [{
+            "k": [self.decoder.n_head, 0, self.decoder.d_key],
+            "v": [self.decoder.n_head, 0, self.decoder.d_value],
+        } for i in range(self.decoder.n_layer)]

-    def forward(self, input, label, length=None):
-        alpha = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        emission_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        transition_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        log_likelihood = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": [label]
-        }
-        if length is not None:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='linear_chain_crf',
-            inputs=this_inputs,
-            outputs={
-                "Alpha": [alpha],
-                "EmissionExps": [emission_exps],
-                "TransitionExps": transition_exps,
-                "LogLikelihood": log_likelihood
-            },
-            attrs={"is_test": self._is_test, })
-        return log_likelihood

+class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
+    def __init__(self, cell, start_token, end_token, beam_size,
+                 var_dim_in_state):
+        super(TransformerBeamSearchDecoder,
+              self).__init__(cell, start_token, end_token, beam_size)
+        self.cell = cell
+        self.var_dim_in_state = var_dim_in_state
+
+    def _merge_batch_beams_with_var_dim(self, x):
+        # The initial cache length is 0 and grows as decoding proceeds, so
+        # merging has to reshape around the variable-length dimension.
+        var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
+        x = layers.transpose(x,
+                             list(range(var_dim_in_state, len(x.shape))) +
+                             list(range(0, var_dim_in_state)))
+        x = layers.reshape(
+            x, [0] * (len(x.shape) - var_dim_in_state
+                      ) + [self.batch_size * self.beam_size] +
+            [int(size) for size in x.shape[-var_dim_in_state + 2:]])
+        x = layers.transpose(
+            x,
+            list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
+            list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
+        return x

-class 
CRFDecoding(Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(CRFDecoding, self).__init__() + def _split_batch_beams_with_var_dim(self, x): + var_dim_size = layers.shape(x)[self.var_dim_in_state] + x = layers.reshape( + x, [-1, self.beam_size] + + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + return x - self._dtype = dtype - self._size = size - self._is_test = is_test - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) + def step(self, time, inputs, states, **kwargs): + # compared to RNN, Transformer has 3D data at every decoding step + inputs = layers.reshape(inputs, [-1, 1]) # token + pos = layers.ones_like(inputs) * time # pos + cell_states = map_structure(self._merge_batch_beams_with_var_dim, + states.cell_states) - @property - def weight(self): - return self._transition + cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, + **kwargs) + cell_outputs = map_structure(self._split_batch_beams, cell_outputs) + next_cell_states = map_structure(self._split_batch_beams_with_var_dim, + next_cell_states) - @weight.setter - def weight(self, value): - self._transition = value + beam_search_output, beam_search_state = self._beam_search_step( + time=time, + logits=cell_outputs, + next_cell_states=next_cell_states, + beam_state=states) + next_inputs, finished = (beam_search_output.predicted_ids, + beam_search_state.finished) - def forward(self, input, label=None, length=None): + return (beam_search_output, beam_search_state, next_inputs, finished) - viterbi_path = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": label - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='crf_decoding', - inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}, - attrs={"is_test": self._is_test, }) - return viterbi_path +### Transformer Modules ### +class PrePostProcessLayer(Layer): + """ + PrePostProcessLayer + """ -class GRUEncoder(Layer): def __init__(self, - input_dim, - grnn_hidden_dim, - init_bound, - num_layers=1, - is_bidirection=False): - super(GRUEncoder, self).__init__() - self.num_layers = num_layers - self.is_bidirection = is_bidirection - self.gru_list = [] - self.gru_r_list = [] - for i in range(num_layers): - self.basic_gru_cell = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_list.append( - self.add_sublayer( - "gru_%d" % i, - RNN(self.basic_gru_cell, - is_reverse=False, - time_major=False))) - if self.is_bidirection: - for i in range(num_layers): - self.basic_gru_cell_r = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_r_list.append( + process_cmd, + d_model, + dropout_rate, + reused_layer_norm=None): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = 
process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append( + lambda x, y: x + y if y is not None else x) + elif cmd == "n": # add layer normalization + if reused_layer_norm is not None: + layer_norm = reused_layer_norm + else: + layer_norm = LayerNorm( + normalized_shape=d_model, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))) + + self.functors.append( self.add_sublayer( - "gru_r_%d" % i, - RNN(self.basic_gru_cell_r, - is_reverse=True, - time_major=False))) + "layer_norm_%d" % len( + self.sublayers(include_sublayers=False)), + layer_norm)) + elif cmd == "d": # add dropout + self.functors.append(lambda x: layers.dropout( + x, dropout_prob=dropout_rate, is_test=False) + if dropout_rate else x) - def forward(self, input_feature, h0=None): - for i in range(self.num_layers): - pre_gru, pre_state = self.gru_list[i](input_feature) - if self.is_bidirection: - gru_r, r_state = self.gru_r_list[i](input_feature) - out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) else: - out = pre_gru - input_feature = out - return out + x = self.functors[i](x) + return x -class SequenceTagging(Layer): - def __init__(self, - vocab_size, - num_labels, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1): - super(SequenceTagging, self).__init__() - """ - define the sequence tagging network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. 
+class MultiHeadAttention(Layer): + """ + Multi-Head Attention + """ - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.word_emb_dim = word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = grnn_hidden_dim - self.emb_lr = emb_learning_rate - self.crf_lr = crf_learning_rate - self.bigru_num = bigru_num - self.init_bound = 0.1 + def __init__(self, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0.0, + reused_query_fc=None, + reused_key_fc=None, + reused_value_fc=None, + reused_proj_fc=None): - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( - learning_rate=self.emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound))) + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate - self.gru_encoder = GRUEncoder( - input_dim=self.grnn_hidden_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - init_bound=self.init_bound, - num_layers=self.bigru_num, - is_bidirection=True) + if reused_query_fc is not None: + self.q_fc = reused_query_fc + else: + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + if reused_key_fc is not None: + self.k_fc = reused_key_fc + else: + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + if reused_value_fc is not None: + self.v_fc = reused_value_fc + else: + self.v_fc = Linear( + input_dim=d_model, + output_dim=d_value * n_head, + bias_attr=False) + if reused_proj_fc is not None: + self.proj_fc = reused_proj_fc + else: + self.proj_fc = Linear( + input_dim=d_value * n_head, + output_dim=d_model, + bias_attr=False) - self.fc = Linear( - input_dim=self.grnn_hidden_dim * 2, - output_dim=self.num_labels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True - self.linear_chain_crf = LinearChainCRF( - param_attr=fluid.ParamAttr( - name='linear_chain_crfw', learning_rate=self.crf_lr), - size=self.num_labels) + q = self.q_fc(queries) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - self.crf_decoding = CRFDecoding( - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=self.crf_lr), - size=self.num_labels) + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - def forward(self, word, lengths, target=None): - """ - Configure the network - """ - word_embed = self.word_embedding(word) - input_feature = word_embed + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] 
= k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v - bigru_output = self.gru_encoder(input_feature) - emission = self.fc(bigru_output) + return q, k, v - if target is not None: - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - else: - self.linear_chain_crf.weight = self.crf_decoding.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, lengths + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + q, k, v = self._prepare_qkv(queries, keys, values, cache) + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) + if attn_bias is not None: + product += attn_bias + weights = layers.softmax(product) + if self.dropout_rate: + weights = layers.dropout( + weights, dropout_prob=self.dropout_rate, is_test=False) -class StackedRNNCell(RNNCell): - def __init__(self, cells): - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + out = layers.matmul(weights, v) - def forward(self, inputs, states): - pass + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - @staticmethod - def stack_param_attr(param_attr, n): - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % - n) - param_attrs = [ - fluid.ParamAttr._to_attr(attr) for attr in param_attr - ] - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs + # project to output + out = self.proj_fc(out) + return out - @property - def state_shape(self): - return [cell.state_shape for cell in self.cells] + def cal_kv(self, keys, values): + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v -class StackedLSTMCell(RNNCell): +class FFN(Layer): """ + Feed-Forward Network """ def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + d_inner_hid, + d_model, + dropout_rate, + fc1_act="relu", + reused_fc1=None, + reused_fc2=None): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + if reused_fc1 is not None: + self.fc1 = reused_fc1 + else: + self.fc1 = Linear( + input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) + if reused_fc2 is not None: + self.fc2 = reused_fc2 + else: + 
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) - self.cells = [] - for i in range(num_layers): - if forget_bias is True: - bias_attrs[ - i].initializer = fluid.initializer.NumpyArrayInitializer( - np.concatenate( - np.zeros(2 * hidden_size), - np.ones(hidden_size), np.zeros(hidden_size)) - .astype(dtype)) - forget_bias = 0.0 - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - BasicLSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - forget_bias=forget_bias, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + def forward(self, x): + hidden = self.fc1(x) + if self.dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=self.dropout_rate, is_test=False) + out = self.fc2(hidden) + return out - def forward(self, step_input, states): - new_states = [] - for i, cell in enumerate(self.cells): - out, new_state = cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else out - new_states.append(new_state) + +class TransformerEncoderLayer(Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu", + reused_pre_selatt_layernorm=None, + reused_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_selfatt_layernorm=None, + reused_pre_ffn_layernorm=None, + reused_ffn_weights={"reused_fc1": None, + "reused_fc2": None}, + reused_post_ffn_layernorm=None): + + super(TransformerEncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_selatt_layernorm) + self.self_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_multihead_att_weights["reused_query_fc"], + reused_key_fc=reused_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_multihead_att_weights["reused_value_fc"], + reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"]) + self.postprocesser1 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_selfatt_layernorm) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_ffn_layernorm) + self.ffn = FFN(d_inner_hid, + d_model, + relu_dropout, + fc1_act=ffn_fc1_act, + reused_fc1=reused_ffn_weights["reused_fc1"], + reused_fc2=reused_ffn_weights["reused_fc2"]) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout, + reused_post_ffn_layernorm) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class TransformerEncoder(Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu"): + + super(TransformerEncoder, self).__init__() + + 
self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerEncoderLayer( + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + ffn_fc1_act=ffn_fc1_act))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + + return self.processer(enc_output) + + +class TransformerDecoderLayer(Layer): + """ + decoder + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + reused_pre_selfatt_layernorm=None, + reused_self_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_selfatt_layernorm=None, + reused_pre_crossatt_layernorm=None, + reused_cross_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_crossatt_layernorm=None, + reused_pre_ffn_layernorm=None, + reused_ffn_weights={"reused_fc1": None, + "reused_fc2": None}, + reused_post_ffn_layernorm=None): + super(TransformerDecoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_selfatt_layernorm) + self.self_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_self_multihead_att_weights[ + "reused_query_fc"], + reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_self_multihead_att_weights[ + "reused_value_fc"], + reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"]) + self.postprocesser1 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_selfatt_layernorm) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_crossatt_layernorm) + self.cross_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_cross_multihead_att_weights[ + "reused_query_fc"], + reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_cross_multihead_att_weights[ + "reused_value_fc"], + reused_proj_fc=reused_cross_multihead_att_weights[ + "reused_proj_fc"]) + self.postprocesser2 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_crossatt_layernorm) + + self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_ffn_layernorm) + self.ffn = FFN(d_inner_hid, + d_model, + relu_dropout, + reused_fc1=reused_ffn_weights["reused_fc1"], + reused_fc2=reused_ffn_weights["reused_fc2"]) + self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout, + reused_post_ffn_layernorm) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + cache=None): + self_attn_output = self.self_attn( + self.preprocesser1(dec_input), None, None, self_attn_bias, cache) + self_attn_output = self.postprocesser1(self_attn_output, dec_input) + + cross_attn_output = self.cross_attn( + 
self.preprocesser2(self_attn_output), enc_output, enc_output, + cross_attn_bias, cache) + cross_attn_output = self.postprocesser2(cross_attn_output, + self_attn_output) + + ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) + ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + + return ffn_output + + +class TransformerDecoder(Layer): + """ + decoder + """ + + def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd): + super(TransformerDecoder, self).__init__() + + self.n_layer = n_layer + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + + self.decoder_layers = list() + for i in range(n_layer): + self.decoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerDecoderLayer( + n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + caches=None): + for i, decoder_layer in enumerate(self.decoder_layers): + dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, + cross_attn_bias, None + if caches is None else caches[i]) + dec_input = dec_output + + return self.processer(dec_output) + + def prepare_static_cache(self, enc_output): + return [ + dict( + zip(("static_k", "static_v"), + decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) + for decoder_layer in self.decoder_layers + ] + + def prepare_incremental_cache(self, enc_output): + return [{ + "k": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_key], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_value], + dtype=enc_output.dtype, + value=0), + } for i in range(self.n_layer)] + + +#TODO: we should merge GRUCell with BasicGRUCell +class GRUCell(RNNCell): + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation='sigmoid', + candidate_activation='tanh', + origin_mode=False): + super(GRUCell, self).__init__() + self.hidden_size = hidden_size + self.fc_layer = Linear( + input_size, hidden_size * 3, param_attr=param_attr) + + self.gru_unit = GRUUnit( + hidden_size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + + def forward(self, inputs, states): + # for GRUCell, `step_outputs` and `new_states` both are hidden + x = self.fc_layer(inputs) + hidden, _, _ = self.gru_unit(x, states) + return hidden, hidden + + @property + def state_shape(self): + return [self.hidden_size] + + +#TODO: we should merge GRUCell with BasicGRUCell +class GRUEncoderCell(RNNCell): + def __init__(self, + num_layers, + input_size, + hidden_size, + dropout_prob=0., + init_scale=0.1): + super(GRUEncoderCell, self).__init__() + self.dropout_prob = dropout_prob + # use add_sublayer to add multi-layers + self.gru_cells = [] + for i in range(num_layers): + self.gru_cells.append( + self.add_sublayer( + "gru_%d" % i, + #BasicGRUCell( + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))))) + + def 
+    """
+
+    def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+                 prepostprocess_dropout, attention_dropout, relu_dropout,
+                 preprocess_cmd, postprocess_cmd):
+        super(TransformerDecoder, self).__init__()
+
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+
+        self.decoder_layers = list()
+        for i in range(n_layer):
+            self.decoder_layers.append(
+                self.add_sublayer(
+                    "layer_%d" % i,
+                    TransformerDecoderLayer(
+                        n_head, d_key, d_value, d_model, d_inner_hid,
+                        prepostprocess_dropout, attention_dropout,
+                        relu_dropout, preprocess_cmd, postprocess_cmd)))
+        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
+                                             prepostprocess_dropout)
+
+    def forward(self,
+                dec_input,
+                enc_output,
+                self_attn_bias,
+                cross_attn_bias,
+                caches=None):
+        for i, decoder_layer in enumerate(self.decoder_layers):
+            dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
+                                       cross_attn_bias, None
+                                       if caches is None else caches[i])
+            dec_input = dec_output
+
+        return self.processer(dec_output)
+
+    def prepare_static_cache(self, enc_output):
+        return [
+            dict(
+                zip(("static_k", "static_v"),
+                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
+            for decoder_layer in self.decoder_layers
+        ]
+
+    def prepare_incremental_cache(self, enc_output):
+        return [{
+            "k": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
+
+#TODO: we should merge GRUCell with BasicGRUCell
+class GRUCell(RNNCell):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 origin_mode=False):
+        super(GRUCell, self).__init__()
+        self.hidden_size = hidden_size
+        self.fc_layer = Linear(
+            input_size, hidden_size * 3, param_attr=param_attr)
+
+        self.gru_unit = GRUUnit(
+            hidden_size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+    def forward(self, inputs, states):
+        # for GRUCell, `step_outputs` and `new_states` both are hidden
+        x = self.fc_layer(inputs)
+        hidden, _, _ = self.gru_unit(x, states)
+        return hidden, hidden
+
+    @property
+    def state_shape(self):
+        return [self.hidden_size]
+
+
+#TODO: we should merge GRUEncoderCell with BasicGRUCell
+class GRUEncoderCell(RNNCell):
+    def __init__(self,
+                 num_layers,
+                 input_size,
+                 hidden_size,
+                 dropout_prob=0.,
+                 init_scale=0.1):
+        super(GRUEncoderCell, self).__init__()
+        self.dropout_prob = dropout_prob
+        # use add_sublayer to add multi-layers
+        self.gru_cells = []
+        for i in range(num_layers):
+            self.gru_cells.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    #BasicGRUCell(
+                    GRUCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        param_attr=fluid.ParamAttr(
+                            initializer=fluid.initializer.UniformInitializer(
+                                low=-init_scale, high=init_scale)))))
+
+    def forward(self, step_input, states):
+        new_states = []
+        for i, gru_cell in enumerate(self.gru_cells):
+            out, state = gru_cell(step_input, states[i])
+            step_input = layers.dropout(
+                out,
+                self.dropout_prob,
+                dropout_implementation='upscale_in_train'
+            ) if self.dropout_prob > 0 else out
+            new_states.append(step_input)
         return step_input, new_states
 
     @property
-    def state_shape(self):
-        return [cell.state_shape for cell in self.cells]
+    def state_shape(self):
+        return [cell.state_shape for cell in self.gru_cells]
+
+
+class BiGRU(fluid.dygraph.Layer):
+    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
+        super(BiGRU, self).__init__()
+        self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
+                                      init_bound),
+                       is_reverse=False,
+                       time_major=False)
+
+        self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
+                                        init_bound),
+                         is_reverse=True,
+                         time_major=False)
+
+    def forward(self, input_feature):
+        pre_gru, pre_state = self.gru(input_feature)
+        gru_r, r_state = self.gru_r(input_feature)
+        bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
+        return bi_merge
+
+
+class LinearChainCRF(Layer):
+    """
+    Linear chain CRF layer. It holds the transition parameters (with two
+    extra rows for the start and end transitions) and runs the
+    `linear_chain_crf` op to compute the CRF cost (negative log-likelihood)
+    of the given label sequences, which is used as the training loss of
+    sequence tagging models.
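+
+    Examples:
+
+        A minimal dygraph usage sketch; the tag number, shapes and random
+        data below are arbitrary illustrative choices:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from hapi.text.text import LinearChainCRF
+
+            with fluid.dygraph.guard():
+                # batch 2, max length 5, 4 tags; emission scores would
+                # normally come from an encoder network
+                emission = fluid.dygraph.to_variable(
+                    np.random.rand(2, 5, 4).astype("float32"))
+                label = fluid.dygraph.to_variable(
+                    np.random.randint(0, 4, size=(2, 5)).astype("int64"))
+                length = fluid.dygraph.to_variable(
+                    np.array([5, 3]).astype("int64"))
+                crf = LinearChainCRF(
+                    param_attr=fluid.ParamAttr(name="crf_w"), size=4)
+                crf_cost = crf(emission, label, length)  # cost to minimize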
+    """
+
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(LinearChainCRF, self).__init__()
+
+        self._param_attr = param_attr
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label, length=None):
+        alpha = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        emission_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        transition_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        log_likelihood = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": [label]
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='linear_chain_crf',
+            inputs=this_inputs,
+            outputs={
+                "Alpha": [alpha],
+                "EmissionExps": [emission_exps],
+                "TransitionExps": transition_exps,
+                "LogLikelihood": log_likelihood
+            },
+            attrs={"is_test": self._is_test, })
+        return log_likelihood
-
-class LSTM(Layer):
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 num_layers=1,
-                 dropout=0.0,
-                 is_reverse=False,
-                 time_major=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype='float32'):
-        super(LSTM, self).__init__()
-        lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation,
-                                    activation, forget_bias, num_layers,
-                                    dropout, param_attr, bias_attr, dtype)
-        self.lstm = RNN(lstm_cell, is_reverse, time_major)
-
-    def forward(self, inputs, initial_states=None, sequence_length=None):
-        return self.lstm(inputs, initial_states, sequence_length)
-
-
-class BidirectionalRNN(Layer):
-    def __init__(self,
-                 cell_fw,
-                 cell_bw,
-                 merge_mode='concat',
-                 time_major=False,
-                 cell_cls=None,
-                 **kwargs):
-        super(BidirectionalRNN, self).__init__()
-        self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major)
-        self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major)
-        if merge_mode == 'concat':
-            self.merge_func = lambda x, y: layers.concat([x, y], -1)
-        elif merge_mode == 'sum':
-            self.merge_func = lambda x, y: layers.elementwise_add(x, y)
-        elif merge_mode == 'ave':
-            self.merge_func = lambda x, y: layers.scale(
-                layers.elementwise_add(x, y), 0.5)
-        elif merge_mode == 'mul':
-            self.merge_func = lambda x, y: layers.elementwise_mul(x, y)
-        elif merge_mode == 'zip':
-            self.merge_func = lambda x, y: (x, y)
-        elif merge_mode is None:
-            self.merge_func = None
-        else:
-            raise ValueError('Unsupported value for `merge_mode`: %s' %
-                             merge_mode)
-
-    def forward(self, inputs, initial_states=None, sequence_length=None):
-        if isinstance(initial_states, (list, tuple)):
-            assert len(
-                initial_states
-            ) == 2, "length of initial_states should be 2 when it is a list/tuple"
-        else:
-            initial_states = [initial_states, initial_states]
-        outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0],
-                                            sequence_length)
-        outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1],
-                                            sequence_length)
-        outputs = map_structure(
-            self.merge_func, outputs_fw,
-            outputs_bw) if self.merge_func else (outputs_fw, outputs_bw)
-        return outputs, (states_fw, states_bw)
-
-    @staticmethod
-    def bidirect_param_attr(param_attr):
-        if isinstance(param_attr, (list, tuple)):
-            assert len(
-                param_attr
-            ) == 2, "length of param_attr should be 2 when it is a list/tuple"
-            param_attrs = param_attr
-        else:
-            param_attrs = []
-            attr = fluid.ParamAttr._to_attr(param_attr)
-            attr_fw = copy.deepcopy(attr)
-            if attr.name:
-                attr_fw.name = attr_fw.name + "_fw"
-            param_attrs.append(attr_fw)
-            attr_bw = copy.deepcopy(attr)
-            if attr.name:
-                attr_bw.name = attr_bw.name + "_bw"
-            param_attrs.append(attr_bw)
-        return param_attrs
+
+
+class CRFDecoding(Layer):
+    """
+    CRF decoding layer. It shares the transition parameter layout with
+    `LinearChainCRF` and runs the `crf_decoding` op to obtain the best
+    (Viterbi) label sequence for the given emission scores.
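+
+    Examples:
+
+        A minimal dygraph usage sketch continuing the `LinearChainCRF`
+        example; shapes and data are arbitrary illustrative choices:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from hapi.text.text import CRFDecoding
+
+            with fluid.dygraph.guard():
+                emission = fluid.dygraph.to_variable(
+                    np.random.rand(2, 5, 4).astype("float32"))
+                length = fluid.dygraph.to_variable(
+                    np.array([5, 3]).astype("int64"))
+                crf_decoding = CRFDecoding(
+                    param_attr=fluid.ParamAttr(name="crfw"), size=4)
+                # best label sequence for each sample, shape: [2, 5]
+                viterbi_path = crf_decoding(emission, length=length)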
+    """
+
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(CRFDecoding, self).__init__()
+
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._param_attr = param_attr
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label=None, length=None):
+        viterbi_path = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": label
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='crf_decoding',
+            inputs=this_inputs,
+            outputs={"ViterbiPath": [viterbi_path]},
+            attrs={"is_test": self._is_test, })
+        return viterbi_path
-
-
-class BidirectionalLSTM(Layer):
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 num_layers=1,
-                 dropout=0.0,
-                 merge_mode='concat',
-                 merge_each_layer=False,
-                 time_major=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype='float32'):
-        super(BidirectionalLSTM, self).__init__()
-        self.num_layers = num_layers
-        self.merge_mode = merge_mode
-        self.merge_each_layer = merge_each_layer
-        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
-        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
-        if not merge_each_layer:
-            cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation,
-                                      activation, forget_bias, num_layers,
-                                      dropout, param_attrs[0], bias_attrs[0],
-                                      dtype)
-            cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation,
-                                      activation, forget_bias, num_layers,
-                                      dropout, param_attrs[1], bias_attrs[1],
-                                      dtype)
-            self.lstm = BidirectionalRNN(
-                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
-        else:
-            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
-                                                             num_layers)
-            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
-                                                             num_layers)
-            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
-                                                            num_layers)
-            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
-                                                            num_layers)
-
-            # maybe design cell including both forward and backward later
-            self.lstm = []
-            for i in range(num_layers):
-                cell_fw = StackedLSTMCell(
-                    input_size if i == 0 else (hidden_size * 2
-                                               if merge_mode == 'concat' else
-                                               hidden_size), hidden_size,
-                    gate_activation, activation, forget_bias, 1, dropout,
-                    fw_param_attrs[i], fw_bias_attrs[i], dtype)
-                cell_bw = StackedLSTMCell(
-                    input_size if i == 0 else (hidden_size * 2
-                                               if merge_mode == 'concat' else
-                                               hidden_size), hidden_size,
-                    gate_activation, activation, forget_bias, 1, dropout,
-                    bw_param_attrs[i], bw_bias_attrs[i], dtype)
-                self.lstm.append(
-                    self.add_sublayer(
-                        "lstm_%d" % i,
-                        BidirectionalRNN(
-                            cell_fw,
-                            cell_bw,
-                            merge_mode=merge_mode,
-                            time_major=time_major)))
-
-    def forward(self, inputs, initial_states=None, sequence_length=None):
-        if not self.merge_each_layer:
-            return self.lstm(inputs, initial_states, sequence_length)
-        else:
-            if isinstance(initial_states, (list, tuple)):
-                assert len(initial_states) == self.num_layers, (
-                    "length of initial_states should be %d when it is a list/tuple"
-                    % self.num_layers)
-            else:
-                initial_states = [initial_states] * self.num_layers
-            stacked_states = []
-            for i in range(self.num_layers):
-                outputs, states = self.lstm[i](inputs, initial_states[i],
-                                               sequence_length)
-                inputs = outputs
-                stacked_states.append(states)
-            return outputs, stacked_states
+
+
+class GRUEncoder(Layer):
+    """
+    A multi-layer, optionally bidirectional, GRU encoder built from
+    `BasicGRUCell` and `RNN`. In the bidirectional case the outputs of the
+    forward and reversed RNNs of each layer are concatenated along the last
+    dimension before being fed to the next layer.
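+
+    Examples:
+
+        A minimal dygraph usage sketch; the shapes and random data are
+        arbitrary illustrative choices. Note that `input_dim` is chosen equal
+        to `grnn_hidden_dim` so that layers after the first (which expect
+        `input_dim * 2` features when bidirectional) line up:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from hapi.text.text import GRUEncoder
+
+            with fluid.dygraph.guard():
+                # batch 2, sequence length 5, feature size 128
+                inputs = fluid.dygraph.to_variable(
+                    np.random.rand(2, 5, 128).astype("float32"))
+                encoder = GRUEncoder(
+                    input_dim=128, grnn_hidden_dim=128, init_bound=0.1,
+                    num_layers=2, is_bidirection=True)
+                out = encoder(inputs)  # shape: [2, 5, 256]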
+    """
+
+    def __init__(self,
+                 input_dim,
+                 grnn_hidden_dim,
+                 init_bound,
+                 num_layers=1,
+                 is_bidirection=False):
+        super(GRUEncoder, self).__init__()
+        self.num_layers = num_layers
+        self.is_bidirection = is_bidirection
+        self.gru_list = []
+        self.gru_r_list = []
+        for i in range(num_layers):
+            self.basic_gru_cell = BasicGRUCell(
+                input_size=input_dim if i == 0 else input_dim * 2,
+                hidden_size=grnn_hidden_dim,
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-init_bound, high=init_bound),
+                    regularizer=fluid.regularizer.L2DecayRegularizer(
+                        regularization_coeff=1e-4)))
+            self.gru_list.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    RNN(self.basic_gru_cell,
+                        is_reverse=False,
+                        time_major=False)))
+        if self.is_bidirection:
+            for i in range(num_layers):
+                self.basic_gru_cell_r = BasicGRUCell(
+                    input_size=input_dim if i == 0 else input_dim * 2,
+                    hidden_size=grnn_hidden_dim,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.UniformInitializer(
+                            low=-init_bound, high=init_bound),
+                        regularizer=fluid.regularizer.L2DecayRegularizer(
+                            regularization_coeff=1e-4)))
+                self.gru_r_list.append(
+                    self.add_sublayer(
+                        "gru_r_%d" % i,
+                        RNN(self.basic_gru_cell_r,
+                            is_reverse=True,
+                            time_major=False)))
+
+    def forward(self, input_feature, h0=None):
+        for i in range(self.num_layers):
+            pre_gru, pre_state = self.gru_list[i](input_feature)
+            if self.is_bidirection:
+                gru_r, r_state = self.gru_r_list[i](input_feature)
+                out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
+            else:
+                out = pre_gru
+            input_feature = out
+        return out
-
-
-class StackedGRUCell(RNNCell):
-    """
-    """
-
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 gate_activation=None,
-                 activation=None,
-                 num_layers=1,
-                 dropout=0.0,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype="float32"):
-        super(StackedGRUCell, self).__init__()
-        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
-                                             float)
-        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
-        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
-
-        self.cells = []
-        for i in range(num_layers):
-            self.cells.append(
-                self.add_sublayer(
-                    "gru_%d" % i,
-                    BasicGRUCell(
-                        input_size=input_size if i == 0 else hidden_size,
-                        hidden_size=hidden_size,
-                        gate_activation=gate_activation,
-                        activation=activation,
-                        param_attr=param_attrs[i],
-                        bias_attr=bias_attrs[i],
-                        dtype=dtype)))
-
-    def forward(self, step_input, states):
-        new_states = []
-        for i, cell in enumerate(self.cells):
-            out, new_state = cell(step_input, states[i])
-            step_input = layers.dropout(
-                out,
-                self.dropout[i],
-                dropout_implementation='upscale_in_train') if self.dropout[
-                    i] > 0 else out
-            new_states.append(new_state)
-        return step_input, new_states
-
-    @property
-    def state_shape(self):
-        return [cell.state_shape for cell in self.cells]
+
+
+class SequenceTagging(Layer):
+    """
+    Sequence tagging network for tasks such as lexical analysis. It stacks a
+    word embedding layer, a multi-layer bidirectional GRU encoder
+    (`GRUEncoder`) and a linear projection to the label space, then uses a
+    linear chain CRF: `LinearChainCRF` produces the training cost and
+    `CRFDecoding` produces the predicted tag sequence. When `target` is
+    provided, `forward` returns `(crf_decode, avg_cost, lengths)`; otherwise
+    it returns `(crf_decode, lengths)`.
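+
+    Examples:
+
+        A minimal dygraph usage sketch; the vocabulary size, label number,
+        shapes and random data below are arbitrary illustrative choices:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from hapi.text.text import SequenceTagging
+
+            with fluid.dygraph.guard():
+                # batch 2, max length 5, vocabulary of 100 words, 5 labels
+                word = fluid.dygraph.to_variable(
+                    np.random.randint(0, 100, size=(2, 5)).astype("int64"))
+                lengths = fluid.dygraph.to_variable(
+                    np.array([5, 3]).astype("int64"))
+                target = fluid.dygraph.to_variable(
+                    np.random.randint(0, 5, size=(2, 5)).astype("int64"))
+                tagger = SequenceTagging(vocab_size=100, num_labels=5)
+                # training mode: decoded path, average CRF cost and lengths
+                crf_decode, avg_cost, lengths = tagger(word, lengths, target)
+                # inference mode: decoded path and lengths only
+                crf_decode, lengths = tagger(word, lengths)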
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 num_labels,
+                 word_emb_dim=128,
+                 grnn_hidden_dim=128,
+                 emb_learning_rate=0.1,
+                 crf_learning_rate=0.1,
+                 bigru_num=2,
+                 init_bound=0.1):
+        super(SequenceTagging, self).__init__()
+        self.word_emb_dim = word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = grnn_hidden_dim
+        self.emb_lr = emb_learning_rate
+        self.crf_lr = crf_learning_rate
+        self.bigru_num = bigru_num
+        self.init_bound = init_bound
+
+        self.word_embedding = Embedding(
+            size=[self.vocab_size, self.word_emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                learning_rate=self.emb_lr,
+                name="word_emb",
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound)))
+
+        self.gru_encoder = GRUEncoder(
+            input_dim=self.grnn_hidden_dim,
+            grnn_hidden_dim=self.grnn_hidden_dim,
+            init_bound=self.init_bound,
+            num_layers=self.bigru_num,
+            is_bidirection=True)
+
+        self.fc = Linear(
+            input_dim=self.grnn_hidden_dim * 2,
+            output_dim=self.num_labels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.linear_chain_crf = LinearChainCRF(
+            param_attr=fluid.ParamAttr(
+                name='linear_chain_crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+
+        self.crf_decoding = CRFDecoding(
+            param_attr=fluid.ParamAttr(
+                name='crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
-
-
-class GRU(Layer):
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 gate_activation=None,
-                 activation=None,
-                 num_layers=1,
-                 dropout=0.0,
-                 is_reverse=False,
-                 time_major=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype='float32'):
-        super(GRU, self).__init__()
-        gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation,
-                                  activation, num_layers, dropout, param_attr,
-                                  bias_attr, dtype)
-        self.gru = RNN(gru_cell, is_reverse, time_major)
-
-    def forward(self, inputs, initial_states=None, sequence_length=None):
-        return self.gru(inputs, initial_states, sequence_length)
-
-
-class BidirectionalGRU(Layer):
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 num_layers=1,
-                 dropout=0.0,
-                 merge_mode='concat',
-                 merge_each_layer=False,
-                 time_major=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype='float32'):
-        super(BidirectionalGRU, self).__init__()
-        self.num_layers = num_layers
-        self.merge_mode = merge_mode
-        self.merge_each_layer = merge_each_layer
-        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
-        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
-        if not merge_each_layer:
-            cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation,
-                                     activation, num_layers, dropout,
-                                     param_attrs[0], bias_attrs[0], dtype)
-            cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation,
-                                     activation, num_layers, dropout,
-                                     param_attrs[1], bias_attrs[1], dtype)
-            self.gru = BidirectionalRNN(
-                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
-        else:
-            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
-                                                             num_layers)
-            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
-                                                             num_layers)
-            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
-                                                            num_layers)
-            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
-                                                            num_layers)
-
-            # maybe design cell including both forward and backward later
-            self.gru = []
-            for i in range(num_layers):
-                cell_fw = StackedGRUCell(input_size if i == 0 else (
-                    hidden_size * 2 if merge_mode == 'concat' else
-                    hidden_size), hidden_size, gate_activation, activation, 1,
-                                         dropout, fw_param_attrs[i],
-                                         fw_bias_attrs[i], dtype)
-                cell_bw = StackedGRUCell(input_size if i == 0 else (
-                    hidden_size * 2 if merge_mode == 'concat' else
-                    hidden_size), hidden_size, gate_activation, activation, 1,
-                                         dropout, bw_param_attrs[i],
-                                         bw_bias_attrs[i], dtype)
-                self.gru.append(
-                    self.add_sublayer(
-                        "gru_%d" % i,
-                        BidirectionalRNN(
-                            cell_fw,
-                            cell_bw,
-                            merge_mode=merge_mode,
-                            time_major=time_major)))
-
-    def forward(self, inputs, initial_states=None, sequence_length=None):
-        if not self.merge_each_layer:
-            return self.gru(inputs, initial_states, sequence_length)
-        else:
-            if isinstance(initial_states, (list, tuple)):
-                assert len(initial_states) == self.num_layers, (
-                    "length of initial_states should be %d when it is a list/tuple"
-                    % self.num_layers)
-            else:
-                initial_states = [initial_states] * self.num_layers
-            stacked_states = []
-            for i in range(self.num_layers):
-                outputs, states = self.gru[i](inputs, initial_states[i],
-                                              sequence_length)
-                inputs = outputs
-                stacked_states.append(states)
-            return outputs, stacked_states
+
+    def forward(self, word, lengths, target=None):
+        word_embed = self.word_embedding(word)
+        input_feature = word_embed
+
+        bigru_output = self.gru_encoder(input_feature)
+        emission = self.fc(bigru_output)
+
+        if target is not None:
+            crf_cost = self.linear_chain_crf(
+                input=emission, label=target, length=lengths)
+            avg_cost = fluid.layers.mean(x=crf_cost)
+            self.crf_decoding.weight = self.linear_chain_crf.weight
+            crf_decode = self.crf_decoding(input=emission, length=lengths)
+            return crf_decode, avg_cost, lengths
+        else:
+            self.linear_chain_crf.weight = self.crf_decoding.weight
+            crf_decode = self.crf_decoding(input=emission, length=lengths)
+            return crf_decode, lengths
-- 
GitLab