Unverified commit acee3dd3, authored by: lugimzzz, committed by: GitHub

[fluid clean] remove 4 fluid.layers api and migrate 2 fluid.layer api (#48972)

* fluid clean layer

* docs
Parent b06a5946
@@ -36,503 +36,15 @@ from ..data_feeder import check_variable_and_dtype, check_type, check_dtype
from collections.abc import Sequence
__all__ = [
'RNNCell',
'GRUCell',
'LSTMCell',
'rnn',
'birnn',
'dynamic_decode',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'lstm_unit',
'lstm',
]
class RNNCell:
"""
:api_attr: Static Graph
RNNCell is the base class of abstractions representing the calculations
that map the input and state to the output and new state. It is suitable
for, and mostly used in, RNNs.
"""
def call(self, inputs, states, **kwargs):
r"""
Every cell must implement this method to do the calculations mapping the
inputs and states to the output and new states.
To be more flexible, both inputs and states can be a tensor variable or
a nested structure (list|tuple|namedtuple|dict) of tensor variable, that
is, a (possibly nested structure of) tensor variable[s].
Parameters:
inputs: A (possibly nested structure of) tensor variable[s].
states: A (possibly nested structure of) tensor variable[s].
**kwargs: Additional keyword arguments, provided by the caller.
Returns:
tuple: outputs and new_states pair. outputs and new_states both \
can be nested structures of tensor variables. new_states must \
have the same structure as states.
"""
raise NotImplementedError("RNNCell must implent the call function.")
def __call__(self, inputs, states, **kwargs):
return self.call(inputs, states, **kwargs)
def get_initial_states(
self,
batch_ref,
shape=None,
dtype='float32',
init_value=0,
batch_dim_idx=0,
):
r"""
Generate initialized states according to provided shape, data type and
value.
Parameters:
batch_ref: A (possibly nested structure of) tensor variable[s].
The first dimension of the tensor will be used as batch size to
initialize states.
shape: A (possibly nested structure of) shape[s], where a shape is
represented as a list/tuple of integers. -1 (for batch size) will
be automatically inserted if shape does not start with it. If None,
property `state_shape` will be used. The default value is None.
dtype: A (possibly nested structure of) data type[s]. The structure
must be the same as that of `shape`, except when all tensors in states
have the same data type, in which case a single data type can be used. If
property `cell.state_shape` is not available, float32 will be used
as the data type. The default value is float32.
init_value: A float value used to initialize states.
batch_dim_idx: An integer indicating which dimension of the tensor in
inputs represents batch size. The default value is 0.
Returns:
Variable: tensor variable[s] packed in the same structure provided \
by shape, representing the initialized states.
"""
check_variable_and_dtype(
batch_ref,
'batch_ref',
['float32', 'float64', 'int32', 'int64'],
'RNNCell',
)
check_type(shape, 'shape', (list, tuple, type(None), int), 'RNNCell')
if isinstance(shape, (list, tuple)):
shapes = map_structure(lambda x: x, shape)
if isinstance(shape, list):
for i, _shape in enumerate(shapes):
check_type(_shape, 'shapes[' + str(i) + ']', int, 'RNNCell')
else:
check_type(shapes, 'shapes', int, 'RNNCell')
check_dtype(dtype, 'dtype', ['float32', 'float64'], 'RNNCell')
# TODO: use inputs and batch_size
batch_ref = flatten(batch_ref)[0]
def _is_shape_sequence(seq):
"""For shape, list/tuple of integer is the finest-grained objection"""
if isinstance(seq, list) or isinstance(seq, tuple):
if reduce(
lambda flag, x: isinstance(x, int) and flag, seq, True
):
return False
# TODO: Add check for the illegal
if isinstance(seq, dict):
return True
return isinstance(seq, Sequence) and not isinstance(seq, str)
class Shape:
def __init__(self, shape):
self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
# nested structure of shapes
states_shapes = self.state_shape if shape is None else shape
is_sequence_ori = utils.is_sequence
utils.is_sequence = _is_shape_sequence
states_shapes = map_structure(lambda shape: Shape(shape), states_shapes)
utils.is_sequence = is_sequence_ori
# nested structure of dtypes
try:
states_dtypes = self.state_dtype if dtype is None else dtype
except NotImplementedError: # use fp32 as default
states_dtypes = "float32"
if len(flatten(states_dtypes)) == 1:
dtype = flatten(states_dtypes)[0]
states_dtypes = map_structure(lambda shape: dtype, states_shapes)
init_states = map_structure(
lambda shape, dtype: tensor.fill_constant_batch_size_like(
input=batch_ref,
shape=shape.shape,
dtype=dtype,
value=init_value,
input_dim_idx=batch_dim_idx,
),
states_shapes,
states_dtypes,
)
return init_states
@property
def state_shape(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) shape[s], where a shape is represented
as a list/tuple of integers (-1 for batch size would be automatically
inserted into a shape if the shape does not start with it).
Not necessary to be implemented if states are not initialized by
`get_initial_states` or the `shape` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_shape` in the used cell."
)
@property
def state_dtype(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) data type[s]. The structure must be
the same as that of `shape`, except when all tensors in states have the same
data type, in which case a single data type can be used.
Not necessary to be implemented if states are not initialized
by `get_initial_states` or the `dtype` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_dtype` in the used cell."
)
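For orientation, a concrete cell only needs to implement `call` and, optionally, the `state_shape` / `state_dtype` properties described above. A minimal toy subclass (illustrative only, not part of this patch) could look like:

class AddCell(RNNCell):
    """Toy cell: the new state is the element-wise sum of input and previous state."""

    def __init__(self, hidden_size):
        self.hidden_size = hidden_size

    def call(self, inputs, states):
        # inputs and states are both [batch_size, hidden_size]
        new_state = inputs + states
        return new_state, new_state

    @property
    def state_shape(self):
        # -1 for the batch dimension is inserted by get_initial_states
        return [self.hidden_size]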
class GRUCell(RNNCell):
r"""
:api_attr: Static Graph
Gated Recurrent Unit cell. It is a wrapper for
`fluid.contrib.layers.rnn_impl.BasicGRUUnit` to make it adapt to RNNCell.
The formulas used are as follows:
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
For more details, please refer to `Learning Phrase Representations using
RNN Encoder Decoder for Statistical Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
Examples:
.. code-block:: python
import paddle.fluid.layers as layers
cell = layers.GRUCell(hidden_size=256)
"""
def __init__(
self,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
dtype="float32",
name="GRUCell",
):
"""
Constructor of GRUCell.
Parameters:
hidden_size (int): The hidden size in the GRU cell.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
weight matrix. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias
of GRU. Default: None.
gate_activation (function, optional): The activation function for :math:`act_g`.
Default: `fluid.layers.sigmoid`.
activation (function, optional): The activation function for :math:`act_c`.
Default: `fluid.layers.tanh`.
dtype(string, optional): The data type used in this cell. Default float32.
name(string, optional) : The name scope used to identify parameters and biases.
"""
check_type(hidden_size, 'hidden_size', (int), 'GRUCell')
check_dtype(dtype, 'dtype', ['float32', 'float64'], 'GRUCell')
self.hidden_size = hidden_size
from .. import contrib # TODO: resolve recurrent import
self.gru_unit = contrib.layers.rnn_impl.BasicGRUUnit(
name,
hidden_size,
param_attr,
bias_attr,
gate_activation,
activation,
dtype,
)
def call(self, inputs, states):
r"""
Perform calculations of GRU.
Parameters:
inputs(Variable): A tensor with shape `[batch_size, input_size]`,
corresponding to :math:`x_t` in the formula. The data type
should be float32 or float64.
states(Variable): A tensor with shape `[batch_size, hidden_size]`,
corresponding to :math:`h_{t-1}` in the formula. The data type
should be float32 or float64.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \
`new_states` are the same tensor shaped `[batch_size, hidden_size]`, \
corresponding to :math:`h_t` in the formula. The data type of the \
tensor is the same as that of `states`.
"""
check_variable_and_dtype(
inputs, 'inputs', ['float32', 'float64'], 'GRUCell'
)
check_variable_and_dtype(
states, 'states', ['float32', 'float64'], 'GRUCell'
)
new_hidden = self.gru_unit(inputs, states)
return new_hidden, new_hidden
@property
def state_shape(self):
"""
The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
size would be automatically inserted into shape). The shape corresponds
to :math:`h_{t-1}`.
"""
return [self.hidden_size]
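For reference, this `GRUCell` is one of the wrappers removed by this PR; the migrated tests below construct `paddle.nn.GRUCell(input_size, hidden_size)` instead. A rough dygraph sketch of the replacement (sizes are illustrative, not taken from the patch):

import paddle

cell = paddle.nn.GRUCell(input_size=16, hidden_size=32)
x = paddle.rand((4, 16))        # [batch_size, input_size]
prev_h = paddle.zeros((4, 32))  # [batch_size, hidden_size]
y, h = cell(x, prev_h)          # y and h are both [batch_size, hidden_size]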
class LSTMCell(RNNCell):
r"""
:api_attr: Static Graph
Long-Short Term Memory cell. It is a wrapper for
`fluid.contrib.layers.rnn_impl.BasicLSTMUnit` to make it adapt to RNNCell.
The formulas used are as follows:
.. math::
i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} act_c (c_{t})
For more details, please refer to `RECURRENT NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_
Examples:
.. code-block:: python
import paddle.fluid.layers as layers
cell = layers.LSTMCell(hidden_size=256)
"""
def __init__(
self,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
forget_bias=1.0,
dtype="float32",
name="LSTMCell",
):
"""
Constructor of LSTMCell.
Parameters:
hidden_size (int): The hidden size in the LSTM cell.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
weight matrix. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias
of LSTM. Default: None.
gate_activation (function, optional): The activation function for :math:`act_g`.
Default: 'fluid.layers.sigmoid'.
activation (function, optional): The activation function for :math:`act_c`.
Default: 'fluid.layers.tanh'.
forget_bias(float, optional): forget bias used when computing forget gate.
Default 1.0
dtype(string, optional): The data type used in this cell. Default float32.
name(string, optional) : The name scope used to identify parameters and biases.
"""
check_type(hidden_size, 'hidden_size', (int), 'LSTMCell')
check_dtype(dtype, 'dtype', ['float32', 'float64'], 'LSTMCell')
self.hidden_size = hidden_size
from .. import contrib # TODO: resolve recurrent import
self.lstm_unit = contrib.layers.rnn_impl.BasicLSTMUnit(
name,
hidden_size,
param_attr,
bias_attr,
gate_activation,
activation,
forget_bias,
dtype,
)
def call(self, inputs, states):
r"""
Perform calculations of LSTM.
Parameters:
inputs(Variable): A tensor with shape `[batch_size, input_size]`,
corresponding to :math:`x_t` in the formula. The data type
should be float32 or float64.
states(Variable): A list containing two tensors, each shaped
`[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}`
in the formula. The data type should be float32 or float64.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
a tensor with shape `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}` in the formula; `new_states` is a list containing \
two tensor variables shaped `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}, c_{t}` in the formula. The data type of these \
tensors is the same as that of `states`.
"""
check_variable_and_dtype(
inputs, 'inputs', ['float32', 'float64'], 'LSTMCell'
)
check_type(states, 'states', list, 'LSTMCell')
if isinstance(states, list):
for i, state in enumerate(states):
check_variable_and_dtype(
state,
'state[' + str(i) + ']',
['float32', 'float64'],
'LSTMCell',
)
pre_hidden, pre_cell = states
new_hidden, new_cell = self.lstm_unit(inputs, pre_hidden, pre_cell)
return new_hidden, [new_hidden, new_cell]
@property
def state_shape(self):
"""
The `state_shape` of LSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]`
(-1 for batch size would be automatically inserted into shape). These two
shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
"""
return [[self.hidden_size], [self.hidden_size]]
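Similarly, the removed `LSTMCell` maps onto `paddle.nn.LSTMCell`, which the updated tests below create as `paddle.nn.LSTMCell(input_size, hidden_size)`. A minimal dygraph sketch (shapes illustrative):

import paddle

cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
x = paddle.rand((4, 16))
prev_h = paddle.zeros((4, 32))
prev_c = paddle.zeros((4, 32))
y, (h, c) = cell(x, (prev_h, prev_c))  # y and h are the same tensor; c is the new cell state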
def rnn(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
"""
rnn creates a recurrent neural network specified by RNNCell `cell`,
which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`)
repeatedly until it reaches the maximum length of `inputs`.
Arguments:
cell(RNNCellBase): An instance of `RNNCellBase`.
inputs(Tensor): the input sequences.
If time_major is True, the shape is
`[time_steps, batch_size, input_size]`
else the shape is `[batch_size, time_steps, input_size]`.
initial_states(Tensor|tuple|list, optional): the initial state of the
rnn cell. Tensor or a possibly nested structure of tensors. If not
provided, `cell.get_initial_states` would be called to produce
the initial state. Defaults to None.
sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
or int32. The valid lengths of input sequences. Defaults to None.
If `sequence_length` is not None, the inputs are treated as
padded sequences. In each input sequence, elements whose time step
index are not less than the valid length are treated as paddings.
time_major (bool): Whether the first dimension of the input means the
time steps. Defaults to False.
is_reverse (bool, optional): Indicate whether to calculate in the reverse
order of input sequences. Defaults to False.
**kwargs: Additional keyword arguments to pass to `forward` of the cell.
Returns:
(outputs, final_states)
outputs (Tensor|list|tuple): the output sequence. Tensor or nested
structure of Tensors.
If `time_major` is True, the shape of each tensor in outputs is
`[time_steps, batch_size, hidden_size]`, else
`[batch_size, time_steps, hidden_size]`.
final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
tensor[s], representing the final state for RNN. It has the same
structure as the initial state. Each tensor in final states has the same
shape and dtype as the corresponding tensor in initial states.
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
cell = paddle.nn.SimpleRNNCell(16, 32)
inputs = paddle.rand((4, 23, 16))
prev_h = paddle.randn((4, 32))
outputs, final_states = paddle.fluid.layers.rnn(cell, inputs, prev_h)
"""
if _non_static_mode():
return _rnn_dynamic_graph(
cell,
inputs,
initial_states,
sequence_length,
time_major,
is_reverse,
**kwargs
)
else:
return _rnn_static_graph(
cell,
inputs,
initial_states,
sequence_length,
time_major,
is_reverse,
**kwargs
)
class ArrayWrapper:
def __init__(self, x):
self.array = [x]
@@ -545,273 +57,6 @@ class ArrayWrapper:
return self.array.__getitem__(item)
def _maybe_copy(state, new_state, step_mask):
"""update rnn state or just pass the old state through"""
new_state = paddle.tensor.math._multiply_with_axis(
new_state, step_mask, axis=0
) + paddle.tensor.math._multiply_with_axis(state, (1 - step_mask), axis=0)
return new_state
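`_maybe_copy` keeps the previous state for samples that have run past their valid length and takes the freshly computed state otherwise. A NumPy sketch of the same masking rule, with made-up values:

import numpy as np

state = np.array([[1.0, 1.0], [2.0, 2.0]])       # previous states, [batch, hidden]
new_state = np.array([[9.0, 9.0], [8.0, 8.0]])   # states computed at this step
step_mask = np.array([1.0, 0.0])                 # 1 = this sample is still inside its valid length

# broadcast the mask over the batch axis, as _multiply_with_axis(axis=0) does
merged = new_state * step_mask[:, None] + state * (1.0 - step_mask)[:, None]
# -> [[9., 9.], [2., 2.]]: sample 0 advances, sample 1 keeps its old state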
def _transpose_batch_time(x):
perm = [1, 0] + list(range(2, len(x.shape)))
return paddle.transpose(x, perm)
def _rnn_dynamic_graph(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
time_step_index = 0 if time_major else 1
flat_inputs = flatten(inputs)
time_steps = flat_inputs[0].shape[time_step_index]
if initial_states is None:
initial_states = cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
if not time_major:
inputs = map_structure(_transpose_batch_time, inputs)
if sequence_length is not None:
mask = sequence_lod.sequence_mask(
sequence_length, maxlen=time_steps, dtype=inputs.dtype
)
mask = paddle.transpose(mask, [1, 0])
if is_reverse:
inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
mask = (
paddle.reverse(mask, axis=[0])
if sequence_length is not None
else None
)
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = cell(step_inputs, states, **kwargs)
if sequence_length is not None:
new_states = map_structure(
partial(_maybe_copy, step_mask=mask[i]), states, new_states
)
states = new_states
outputs = (
map_structure(lambda x: ArrayWrapper(x), step_outputs)
if i == 0
else map_structure(
lambda x, x_array: x_array.append(x), step_outputs, outputs
)
)
final_outputs = map_structure(
lambda x: paddle.stack(x.array, axis=time_step_index), outputs
)
if is_reverse:
final_outputs = map_structure(
lambda x: paddle.reverse(x, axis=time_step_index), final_outputs
)
final_states = new_states
return final_outputs, final_states
def _rnn_static_graph(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
if isinstance(inputs, (list, tuple)):
for i, input_x in enumerate(inputs):
check_variable_and_dtype(
input_x, 'inputs[' + str(i) + ']', ['float32', 'float64'], 'rnn'
)
check_type(
initial_states,
'initial_states',
(Variable, list, tuple, type(None)),
'rnn',
)
check_type(
sequence_length, 'sequence_length', (Variable, type(None)), 'rnn'
)
def _switch_grad(x, stop=False):
x.stop_gradient = stop
return x
if initial_states is None:
initial_states = cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
initial_states = map_structure(_switch_grad, initial_states)
if not time_major:
inputs = map_structure(_transpose_batch_time, inputs)
if sequence_length:
max_seq_len = paddle.shape(flatten(inputs)[0])[0]
mask = sequence_lod.sequence_mask(
sequence_length,
maxlen=max_seq_len,
dtype=flatten(initial_states)[0].dtype,
)
mask = paddle.transpose(mask, [1, 0])
if is_reverse:
inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
mask = paddle.reverse(mask, axis=[0]) if sequence_length else None
# StaticRNN
rnn = control_flow.StaticRNN()
with rnn.step():
inputs = map_structure(rnn.step_input, inputs)
states = map_structure(rnn.memory, initial_states)
copy_states = map_structure(lambda x: x, states)
outputs, new_states = cell(inputs, copy_states, **kwargs)
assert_same_structure(states, new_states)
if sequence_length:
step_mask = rnn.step_input(mask)
new_states = map_structure(
partial(_maybe_copy, step_mask=step_mask), states, new_states
)
map_structure(rnn.update_memory, states, new_states)
flat_outputs = flatten(outputs)
map_structure(rnn.step_output, outputs)
map_structure(rnn.step_output, new_states)
rnn_out = rnn()
final_outputs = rnn_out[: len(flat_outputs)]
final_outputs = pack_sequence_as(outputs, final_outputs)
final_states = map_structure(lambda x: x[-1], rnn_out[len(flat_outputs) :])
final_states = pack_sequence_as(new_states, final_states)
if is_reverse:
final_outputs = map_structure(
lambda x: paddle.reverse(x, axis=[0]), final_outputs
)
if not time_major:
final_outputs = map_structure(_transpose_batch_time, final_outputs)
return (final_outputs, final_states)
def birnn(
cell_fw,
cell_bw,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
**kwargs
):
"""
birnn creates a bidirectional recurrent neural network specified by
RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
(for dygraph mode :code:`cell.forward`) repeatedly until it reaches
the maximum length of `inputs`, and then concatenates the outputs of both RNNs
along the last axis.
Arguments:
cell_fw(RNNCellBase): An instance of `RNNCellBase`.
cell_bw(RNNCellBase): An instance of `RNNCellBase`.
inputs(Tensor): the input sequences.
If time_major is True, the shape is
`[time_steps, batch_size, input_size]`
else the shape is `[batch_size, time_steps, input_size]`.
initial_states(tuple, optional): A tuple of initial states of
`cell_fw` and `cell_bw`.
If not provided, `cell.get_initial_states` would be called to
produce initial state for each cell. Defaults to None.
sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
or int32. The valid lengths of input sequences. Defaults to None.
If `sequence_length` is not None, the inputs are treated as
padded sequences. In each input sequence, elements whose time step
index are not less than the valid length are treated as paddings.
time_major (bool): Whether the first dimension of the input means the
time steps. Defaults to False.
**kwargs: Additional keyword arguments to pass to `forward` of each cell.
Returns:
(outputs, final_states)
outputs (Tensor): the outputs of the bidirectional RNN. It is the
concatenation of the outputs from the forward RNN and backward
RNN along the last axis.
If time major is True, the shape is `[time_steps, batch_size, size]`,
else the shape is `[batch_size, time_steps, size]`, where size is
`cell_fw.hidden_size + cell_bw.hidden_size`.
final_states (tuple): A tuple of the final states of the forward
cell and backward cell.
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
cell_fw = paddle.nn.LSTMCell(16, 32)
cell_bw = paddle.nn.LSTMCell(16, 32)
inputs = paddle.rand((4, 23, 16))
hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
initial_states = ((hf, cf), (hb, cb))
outputs, final_states = paddle.fluid.layers.birnn(
cell_fw, cell_bw, inputs, initial_states)
"""
if initial_states is None:
states_fw = cell_fw.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
states_bw = cell_bw.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
else:
states_fw, states_bw = initial_states
outputs_fw, states_fw = rnn(
cell_fw,
inputs,
states_fw,
sequence_length,
time_major=time_major,
**kwargs
)
outputs_bw, states_bw = rnn(
cell_bw,
inputs,
states_bw,
sequence_length,
time_major=time_major,
is_reverse=True,
**kwargs
)
outputs = map_structure(
lambda x, y: tensor.concat([x, y], -1), outputs_fw, outputs_bw
)
final_states = (states_fw, states_bw)
return outputs, final_states
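As with `rnn`, the bidirectional helper has a 2.x counterpart in the `paddle.nn.BiRNN` layer, which also concatenates the forward and backward outputs along the last axis. A hedged sketch (cells and sizes are illustrative, not taken from this patch):

import paddle

cell_fw = paddle.nn.SimpleRNNCell(16, 32)
cell_bw = paddle.nn.SimpleRNNCell(16, 32)
bi_rnn = paddle.nn.BiRNN(cell_fw, cell_bw)

inputs = paddle.rand((4, 23, 16))        # [batch_size, time_steps, input_size]
outputs, final_states = bi_rnn(inputs)   # outputs: [4, 23, 64], forward and backward concatenated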
def _dynamic_decode_imperative(
decoder,
inits=None,
@@ -2175,151 +1420,3 @@ def gru_unit(
)
return updated_hidden, reset_hidden_pre, gate
def lstm_unit(
x_t,
hidden_t_prev,
cell_t_prev,
forget_bias=0.0,
param_attr=None,
bias_attr=None,
name=None,
):
r"""
:api_attr: Static Graph
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
one time step, whose implementation is based on calculations described in `RECURRENT
NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_ .
We add forget_bias to the biases of the forget gate in order to
reduce the scale of forgetting. The formula is as follows:
.. math::
i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} tanh (c_{t})
:math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step;
:math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` ,
representing the outputs from the previous time step.
:math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are the input gate, forget gate, cell, output gate
and hidden calculations, respectively.
Args:
x_t(Variable): A 2D Tensor representing the input of current time step.
Its shape should be :math:`[N, M]` , where :math:`N` stands for batch
size, :math:`M` for the feature size of input. The data type should
be float32 or float64.
hidden_t_prev(Variable): A 2D Tensor representing the hidden value from
previous step. Its shape should be :math:`[N, D]` , where :math:`N`
stands for batch size, :math:`D` for the hidden size. The data type
should be same as ``x_t`` .
cell_t_prev(Variable): A 2D Tensor representing the cell value from
previous step. It has the same shape and data type with ``hidden_t_prev`` .
forget_bias (float, optional): :math:`forget\\_bias` added to the biases
of the forget gate. Default 0.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually the name does not need to be set and
is None by default.
Returns:
tuple: The tuple contains two Tensor variables with the same shape and \
data type with ``hidden_t_prev`` , representing the hidden value and \
cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \
the formula.
Raises:
ValueError: Rank of x_t must be 2.
ValueError: Rank of hidden_t_prev must be 2.
ValueError: Rank of cell_t_prev must be 2.
ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same.
ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim, hidden_dim = 128, 64, 512
data = fluid.data(name='step_data', shape=[None], dtype='int64')
x = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
pre_cell = fluid.data(
name='pre_cell', shape=[None, hidden_dim], dtype='float32')
hidden, cell = fluid.layers.lstm_unit(
x_t=x,
hidden_t_prev=pre_hidden,
cell_t_prev=pre_cell)
"""
helper = LayerHelper('lstm_unit', **locals())
check_variable_and_dtype(x_t, 'x_t', ['float32', 'float64'], 'lstm_unit')
check_variable_and_dtype(
hidden_t_prev, 'hidden_t_prev', ['float32', 'float64'], 'lstm_unit'
)
check_variable_and_dtype(
cell_t_prev, 'cell_t_prev', ['float32', 'float64'], 'lstm_unit'
)
if len(x_t.shape) != 2:
raise ValueError("Rank of x_t must be 2.")
if len(hidden_t_prev.shape) != 2:
raise ValueError("Rank of hidden_t_prev must be 2.")
if len(cell_t_prev.shape) != 2:
raise ValueError("Rank of cell_t_prev must be 2.")
if (
x_t.shape[0] != hidden_t_prev.shape[0]
or x_t.shape[0] != cell_t_prev.shape[0]
):
raise ValueError(
"The 1st dimensions of x_t, hidden_t_prev and "
"cell_t_prev must be the same."
)
if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
raise ValueError(
"The 2nd dimensions of hidden_t_prev and "
"cell_t_prev must be the same."
)
if bias_attr is None:
bias_attr = ParamAttr()
size = cell_t_prev.shape[1]
concat_out = nn.concat(input=[x_t, hidden_t_prev], axis=1)
fc_out = nn.fc(
input=concat_out,
size=4 * size,
param_attr=param_attr,
bias_attr=bias_attr,
)
dtype = x_t.dtype
c = helper.create_variable_for_type_inference(dtype)
h = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='lstm_unit',
inputs={"X": fc_out, "C_prev": cell_t_prev},
outputs={"C": c, "H": h},
attrs={"forget_bias": forget_bias},
)
return h, c
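The removed `lstm_unit` performs exactly one LSTM step following the gate equations quoted in its docstring. A small NumPy reference of those formulas (random placeholder weights; the gate ordering and parameter layout of the actual C++ kernel may differ):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W, b, forget_bias=0.0):
    # W: [input_size + hidden_size, 4 * hidden_size], b: [4 * hidden_size]
    z = np.concatenate([x_t, h_prev], axis=1) @ W + b
    i, f, g, o = np.split(z, 4, axis=1)
    c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(g)
    h = sigmoid(o) * np.tanh(c)
    return h, c

rng = np.random.default_rng(0)
batch, input_size, hidden = 2, 3, 4
h, c = lstm_step(
    rng.standard_normal((batch, input_size)),
    np.zeros((batch, hidden)),
    np.zeros((batch, hidden)),
    rng.standard_normal((input_size + hidden, 4 * hidden)),
    np.zeros(4 * hidden),
)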
@@ -2179,26 +2179,6 @@ class TestBook(LayerTest):
x, kernel_size=[5, 3], stride=[1, 2], padding=(2, 1)
)
def make_lstm_unit(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
x_t_data = self._get_data(
name='x_t_data', shape=[10, 10], dtype='float32'
)
x_t = layers.fc(input=x_t_data, size=10)
prev_hidden_data = self._get_data(
name='prev_hidden_data', shape=[10, 30], dtype='float32'
)
prev_hidden = layers.fc(input=prev_hidden_data, size=30)
prev_cell_data = self._get_data(
name='prev_cell', shape=[10, 30], dtype='float32'
)
prev_cell = layers.fc(input=prev_cell_data, size=30)
return layers.lstm_unit(
x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell
)
def make_softmax(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
@@ -17,10 +17,6 @@ import unittest
import numpy as np
from op_test import OpTest
from paddle import fluid
from paddle.fluid.framework import Program, program_guard
from paddle.fluid.layers import lstm_unit
def sigmoid_np(x):
return 1.0 / (1.0 + np.exp(-x))
@@ -30,79 +26,6 @@ def tanh_np(x):
return 2 * sigmoid_np(2.0 * x) - 1.0
class LstmUnitTestError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
batch_size, dict_dim, emb_dim, hidden_dim = 32, 128, 64, 512
data = fluid.data(
name='step_data', shape=[batch_size], dtype='int64'
)
inputs = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.data(
name='pre_hidden',
shape=[batch_size, hidden_dim],
dtype='float32',
)
pre_cell = fluid.data(
name='pre_cell', shape=[batch_size, hidden_dim], dtype='float32'
)
np_input = np.random.uniform(
-0.1, 0.1, (batch_size, emb_dim)
).astype('float64')
np_pre_hidden = np.random.uniform(
-0.1, 0.1, (batch_size, hidden_dim)
).astype('float64')
np_pre_cell = np.random.uniform(
-0.1, 0.1, (batch_size, hidden_dim)
).astype('float64')
def test_input_Variable():
lstm_unit(np_input, pre_hidden, pre_cell)
self.assertRaises(TypeError, test_input_Variable)
def test_pre_hidden_Variable():
lstm_unit(inputs, np_pre_hidden, pre_cell)
self.assertRaises(TypeError, test_pre_hidden_Variable)
def test_pre_cell_Variable():
lstm_unit(inputs, pre_hidden, np_pre_cell)
self.assertRaises(TypeError, test_pre_cell_Variable)
def test_input_type():
error_input = fluid.data(
name='error_input',
shape=[batch_size, emb_dim],
dtype='int32',
)
lstm_unit(error_input, pre_hidden, pre_cell)
self.assertRaises(TypeError, test_input_type)
def test_pre_hidden_type():
error_pre_hidden = fluid.data(
name='error_pre_hidden',
shape=[batch_size, hidden_dim],
dtype='int32',
)
lstm_unit(inputs, error_pre_hidden, pre_cell)
self.assertRaises(TypeError, test_pre_hidden_type)
def test_pre_cell_type():
error_pre_cell = fluid.data(
name='error_pre_cell',
shape=[batch_size, hidden_dim],
dtype='int32',
)
lstm_unit(inputs, pre_hidden, error_pre_cell)
self.assertRaises(TypeError, test_pre_cell_type)
class LstmUnitTest(OpTest):
def setUp(self):
self.op_type = "lstm_unit"
@@ -16,296 +16,20 @@ import unittest
import numpy
import numpy as np
from rnn.rnn_numpy import LSTMCell
from rnn.rnn_numpy import rnn as numpy_rnn
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
import paddle.fluid.layers.utils as utils
from paddle.fluid import contrib, framework
from paddle.fluid.contrib.layers import basic_lstm
from paddle.fluid import framework
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, program_guard
from paddle.fluid.layers import rnn as dynamic_rnn
from paddle.fluid.layers.rnn import GRUCell, LSTMCell, RNNCell
from paddle.nn.layer.rnn import rnn as dynamic_rnn
class TestLSTMCellError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
batch_size, input_size, hidden_size = 4, 16, 16
inputs = fluid.data(
name='inputs', shape=[None, input_size], dtype='float32'
)
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_size], dtype='float32'
)
pre_cell = fluid.data(
name='pre_cell', shape=[None, hidden_size], dtype='float32'
)
cell = LSTMCell(hidden_size)
def test_input_Variable():
np_input = np.random.random((batch_size, input_size)).astype(
"float32"
)
cell(np_input, [pre_hidden, pre_cell])
self.assertRaises(TypeError, test_input_Variable)
def test_pre_hidden_Variable():
np_pre_hidden = np.random.random(
(batch_size, hidden_size)
).astype("float32")
cell(inputs, [np_pre_hidden, pre_cell])
self.assertRaises(TypeError, test_pre_hidden_Variable)
def test_pre_cell_Variable():
np_pre_cell = np.random.random((batch_size, input_size)).astype(
"float32"
)
cell(inputs, [pre_hidden, np_pre_cell])
self.assertRaises(TypeError, test_pre_cell_Variable)
def test_input_type():
error_inputs = fluid.data(
name='error_inputs', shape=[None, input_size], dtype='int32'
)
cell(error_inputs, [pre_hidden, pre_cell])
self.assertRaises(TypeError, test_input_type)
def test_pre_hidden_type():
error_pre_hidden = fluid.data(
name='error_pre_hidden',
shape=[None, hidden_size],
dtype='int32',
)
cell(inputs, [error_pre_hidden, pre_cell])
self.assertRaises(TypeError, test_pre_hidden_type)
def test_pre_cell_type():
error_pre_cell = fluid.data(
name='error_pre_cell',
shape=[None, hidden_size],
dtype='int32',
)
cell(inputs, [pre_hidden, error_pre_cell])
self.assertRaises(TypeError, test_pre_cell_type)
def test_dtype():
# the input type must be Variable
LSTMCell(hidden_size, dtype="int32")
self.assertRaises(TypeError, test_dtype)
class TestLSTMCell(unittest.TestCase):
def setUp(self):
self.batch_size = 4
self.input_size = 16
self.hidden_size = 16
def test_run(self):
inputs = fluid.data(
name='inputs', shape=[None, self.input_size], dtype='float32'
)
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, self.hidden_size], dtype='float32'
)
pre_cell = fluid.data(
name='pre_cell', shape=[None, self.hidden_size], dtype='float32'
)
cell = LSTMCell(self.hidden_size)
lstm_hidden_new, lstm_states_new = cell(inputs, [pre_hidden, pre_cell])
lstm_unit = contrib.layers.rnn_impl.BasicLSTMUnit(
"basicLSTM",
self.hidden_size,
None,
None,
None,
None,
1.0,
"float32",
)
lstm_hidden, lstm_cell = lstm_unit(inputs, pre_hidden, pre_cell)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
inputs_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.input_size)
).astype('float32')
pre_hidden_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.hidden_size)
).astype('float32')
pre_cell_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.hidden_size)
).astype('float32')
param_names = [
["LSTMCell/BasicLSTMUnit_0.w_0", "basicLSTM/BasicLSTMUnit_0.w_0"],
["LSTMCell/BasicLSTMUnit_0.b_0", "basicLSTM/BasicLSTMUnit_0.b_0"],
]
for names in param_names:
param = np.array(
fluid.global_scope().find_var(names[0]).get_tensor()
)
param = np.random.uniform(-0.1, 0.1, size=param.shape).astype(
'float32'
)
fluid.global_scope().find_var(names[0]).get_tensor().set(
param, place
)
fluid.global_scope().find_var(names[1]).get_tensor().set(
param, place
)
out = exe.run(
feed={
'inputs': inputs_np,
'pre_hidden': pre_hidden_np,
'pre_cell': pre_cell_np,
},
fetch_list=[lstm_hidden_new, lstm_hidden],
)
np.testing.assert_allclose(out[0], out[1], rtol=0.0001, atol=0)
class TestGRUCellError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
batch_size, input_size, hidden_size = 4, 16, 16
inputs = fluid.data(
name='inputs', shape=[None, input_size], dtype='float32'
)
pre_hidden = layers.data(
name='pre_hidden',
shape=[None, hidden_size],
append_batch_size=False,
dtype='float32',
)
cell = GRUCell(hidden_size)
def test_input_Variable():
np_input = np.random.random((batch_size, input_size)).astype(
"float32"
)
cell(np_input, pre_hidden)
self.assertRaises(TypeError, test_input_Variable)
def test_pre_hidden_Variable():
np_pre_hidden = np.random.random(
(batch_size, hidden_size)
).astype("float32")
cell(inputs, np_pre_hidden)
self.assertRaises(TypeError, test_pre_hidden_Variable)
def test_input_type():
error_inputs = fluid.data(
name='error_inputs', shape=[None, input_size], dtype='int32'
)
cell(error_inputs, pre_hidden)
self.assertRaises(TypeError, test_input_type)
def test_pre_hidden_type():
error_pre_hidden = fluid.data(
name='error_pre_hidden',
shape=[None, hidden_size],
dtype='int32',
)
cell(inputs, error_pre_hidden)
self.assertRaises(TypeError, test_pre_hidden_type)
def test_dtype():
# the input type must be Variable
GRUCell(hidden_size, dtype="int32")
self.assertRaises(TypeError, test_dtype)
class TestGRUCell(unittest.TestCase):
def setUp(self):
self.batch_size = 4
self.input_size = 16
self.hidden_size = 16
def test_run(self):
inputs = fluid.data(
name='inputs', shape=[None, self.input_size], dtype='float32'
)
pre_hidden = layers.data(
name='pre_hidden',
shape=[None, self.hidden_size],
append_batch_size=False,
dtype='float32',
)
cell = GRUCell(self.hidden_size)
gru_hidden_new, _ = cell(inputs, pre_hidden)
gru_unit = contrib.layers.rnn_impl.BasicGRUUnit(
"basicGRU", self.hidden_size, None, None, None, None, "float32"
)
gru_hidden = gru_unit(inputs, pre_hidden)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
inputs_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.input_size)
).astype('float32')
pre_hidden_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.hidden_size)
).astype('float32')
param_names = [
["GRUCell/BasicGRUUnit_0.w_0", "basicGRU/BasicGRUUnit_0.w_0"],
["GRUCell/BasicGRUUnit_0.w_1", "basicGRU/BasicGRUUnit_0.w_1"],
["GRUCell/BasicGRUUnit_0.b_0", "basicGRU/BasicGRUUnit_0.b_0"],
["GRUCell/BasicGRUUnit_0.b_1", "basicGRU/BasicGRUUnit_0.b_1"],
]
for names in param_names:
param = np.array(
fluid.global_scope().find_var(names[0]).get_tensor()
)
param = np.random.uniform(-0.1, 0.1, size=param.shape).astype(
'float32'
)
fluid.global_scope().find_var(names[0]).get_tensor().set(
param, place
)
fluid.global_scope().find_var(names[1]).get_tensor().set(
param, place
)
out = exe.run(
feed={'inputs': inputs_np, 'pre_hidden': pre_hidden_np},
fetch_list=[gru_hidden_new, gru_hidden],
)
np.testing.assert_allclose(out[0], out[1], rtol=0.0001, atol=0)
paddle.enable_static()
class TestRnnError(unittest.TestCase):
@@ -336,7 +60,9 @@ class TestRnnError(unittest.TestCase):
inputs_dynamic_rnn = paddle.transpose(
inputs_basic_lstm, perm=[1, 0, 2]
)
cell = LSTMCell(hidden_size, name="LSTMCell_for_rnn")
cell = paddle.nn.LSTMCell(
input_size, hidden_size, name="LSTMCell_for_rnn"
)
np_inputs_dynamic_rnn = np.random.random(
(seq_len, batch_size, input_size)
).astype("float32")
@@ -362,7 +88,9 @@ class TestRnnError(unittest.TestCase):
self.assertRaises(TypeError, test_input_list)
def test_initial_states_type():
cell = GRUCell(hidden_size, name="GRUCell_for_rnn")
cell = paddle.nn.GRUCell(
input_size, hidden_size, name="GRUCell_for_rnn"
)
error_initial_states = np.random.random(
(batch_size, hidden_size)
).astype("float32")
@@ -417,36 +145,9 @@ class TestRnn(unittest.TestCase):
self.seq_len = 4
def test_run(self):
inputs_basic_lstm = fluid.data(
name='inputs_basic_lstm',
shape=[None, None, self.input_size],
dtype='float32',
)
sequence_length = fluid.data(
name="sequence_length", shape=[None], dtype='int64'
)
inputs_dynamic_rnn = paddle.transpose(inputs_basic_lstm, perm=[1, 0, 2])
cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn")
output, final_state = dynamic_rnn(
cell=cell,
inputs=inputs_dynamic_rnn,
sequence_length=sequence_length,
is_reverse=False,
)
output_new = paddle.transpose(output, perm=[1, 0, 2])
rnn_out, last_hidden, last_cell = basic_lstm(
inputs_basic_lstm,
None,
None,
self.hidden_size,
num_layers=1,
batch_first=False,
bidirectional=False,
sequence_length=sequence_length,
forget_bias=1.0,
)
numpy_cell = LSTMCell(self.input_size, self.hidden_size)
dynamic_cell = paddle.nn.LSTMCell(self.input_size, self.hidden_size)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
@@ -455,60 +156,68 @@ class TestRnn(unittest.TestCase):
exe = Executor(place)
exe.run(framework.default_startup_program())
inputs_basic_lstm_np = np.random.uniform(
-0.1, 0.1, (self.seq_len, self.batch_size, self.input_size)
).astype('float32')
state = numpy_cell.parameters
for k, v in dynamic_cell.named_parameters():
param = np.random.uniform(-0.1, 0.1, size=state[k].shape).astype(
'float64'
)
setattr(numpy_cell, k, param)
fluid.global_scope().find_var(v.name).get_tensor().set(param, place)
sequence_length = fluid.data(
name="sequence_length", shape=[None], dtype='int64'
)
inputs_rnn = fluid.data(
name='inputs_rnn',
shape=[None, None, self.input_size],
dtype='float64',
)
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, self.hidden_size], dtype='float64'
)
pre_cell = fluid.data(
name='pre_cell', shape=[None, self.hidden_size], dtype='float64'
)
dynamic_output, dynamic_final_state = dynamic_rnn(
cell=dynamic_cell,
inputs=inputs_rnn,
sequence_length=sequence_length,
initial_states=(pre_hidden, pre_cell),
is_reverse=False,
)
inputs_rnn_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.seq_len, self.input_size)
).astype('float64')
sequence_length_np = (
np.ones(self.batch_size, dtype='int64') * self.seq_len
)
inputs_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.input_size)
).astype('float32')
pre_hidden_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.hidden_size)
).astype('float32')
).astype('float64')
pre_cell_np = np.random.uniform(
-0.1, 0.1, (self.batch_size, self.hidden_size)
).astype('float32')
param_names = [
[
"LSTMCell_for_rnn/BasicLSTMUnit_0.w_0",
"basic_lstm_layers_0/BasicLSTMUnit_0.w_0",
],
[
"LSTMCell_for_rnn/BasicLSTMUnit_0.b_0",
"basic_lstm_layers_0/BasicLSTMUnit_0.b_0",
],
]
for names in param_names:
param = np.array(
fluid.global_scope().find_var(names[0]).get_tensor()
)
param = np.random.uniform(-0.1, 0.1, size=param.shape).astype(
'float32'
)
fluid.global_scope().find_var(names[0]).get_tensor().set(
param, place
)
fluid.global_scope().find_var(names[1]).get_tensor().set(
param, place
)
).astype('float64')
out = exe.run(
o1, _ = numpy_rnn(
cell=numpy_cell,
inputs=inputs_rnn_np,
initial_states=(pre_hidden_np, pre_cell_np),
sequence_length=sequence_length_np,
is_reverse=False,
)
o2 = exe.run(
feed={
'inputs_basic_lstm': inputs_basic_lstm_np,
'inputs_rnn': inputs_rnn_np,
'sequence_length': sequence_length_np,
'inputs': inputs_np,
'pre_hidden': pre_hidden_np,
'pre_cell': pre_cell_np,
},
fetch_list=[output_new, rnn_out],
)
np.testing.assert_allclose(out[0], out[1], rtol=0.0001)
fetch_list=[dynamic_output],
)[0]
np.testing.assert_allclose(o1, o2, rtol=0.001)
class TestRnnUtil(unittest.TestCase):
@@ -528,218 +237,5 @@ class TestRnnUtil(unittest.TestCase):
pass
class EncoderCell(RNNCell):
"""Encoder Cell"""
def __init__(
self,
num_layers,
hidden_size,
dropout_prob=0.0,
init_scale=0.1,
):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout_prob = dropout_prob
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(LSTMCell(hidden_size))
def call(self, step_input, states):
new_states = []
for i in range(self.num_layers):
out, new_state = self.lstm_cells[i](step_input, states[i])
step_input = (
layers.dropout(
out,
self.dropout_prob,
)
if self.dropout_prob
else out
)
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
class DecoderCell(RNNCell):
"""Decoder Cell"""
def __init__(self, num_layers, hidden_size, dropout_prob=0.0):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout_prob = dropout_prob
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(LSTMCell(hidden_size))
def call(self, step_input, states):
new_lstm_states = []
for i in range(self.num_layers):
out, new_lstm_state = self.lstm_cells[i](step_input, states[i])
step_input = (
layers.dropout(
out,
self.dropout_prob,
)
if self.dropout_prob
else out
)
new_lstm_states.append(new_lstm_state)
return step_input, new_lstm_states
def def_seq2seq_model(
num_layers, hidden_size, dropout_prob, src_vocab_size, trg_vocab_size
):
"vanilla seq2seq model"
# data
source = fluid.data(name="src", shape=[None, None], dtype="int64")
source_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64"
)
target = fluid.data(name="trg", shape=[None, None], dtype="int64")
target_length = fluid.data(
name="trg_sequence_length", shape=[None], dtype="int64"
)
label = fluid.data(name="label", shape=[None, None, 1], dtype="int64")
# embedding
src_emb = fluid.embedding(source, (src_vocab_size, hidden_size))
tar_emb = fluid.embedding(target, (src_vocab_size, hidden_size))
# encoder
enc_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
enc_output, enc_final_state = dynamic_rnn(
cell=enc_cell, inputs=src_emb, sequence_length=source_length
)
# decoder
dec_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
dec_output, dec_final_state = dynamic_rnn(
cell=dec_cell, inputs=tar_emb, initial_states=enc_final_state
)
logits = layers.fc(
dec_output,
size=trg_vocab_size,
num_flatten_dims=len(dec_output.shape) - 1,
bias_attr=False,
)
# loss
loss = paddle.nn.functional.softmax_with_cross_entropy(
logits=logits, label=label, soft_label=False
)
loss = layers.unsqueeze(loss, axes=[2])
max_tar_seq_len = paddle.shape(target)[1]
tar_mask = layers.sequence_mask(
target_length, maxlen=max_tar_seq_len, dtype="float32"
)
loss = loss * tar_mask
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
# optimizer
optimizer = fluid.optimizer.Adam(0.001)
optimizer.minimize(loss)
return loss
class TestSeq2SeqModel(unittest.TestCase):
"""
Test cases to confirm seq2seq api training correctly.
"""
def setUp(self):
np.random.seed(123)
self.model_hparams = {
"num_layers": 2,
"hidden_size": 128,
"dropout_prob": 0.1,
"src_vocab_size": 100,
"trg_vocab_size": 100,
}
self.iter_num = iter_num = 2
self.batch_size = batch_size = 4
src_seq_len = 10
trg_seq_len = 12
self.data = {
"src": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, src_seq_len),
).astype("int64"),
"src_sequence_length": np.random.randint(
1, src_seq_len, (iter_num * batch_size,)
).astype("int64"),
"trg": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len),
).astype("int64"),
"trg_sequence_length": np.random.randint(
1, trg_seq_len, (iter_num * batch_size,)
).astype("int64"),
"label": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len, 1),
).astype("int64"),
}
place = (
core.CUDAPlace(0)
if core.is_compiled_with_cuda()
else core.CPUPlace()
)
self.exe = Executor(place)
def test_seq2seq_model(self):
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
cost = def_seq2seq_model(**self.model_hparams)
self.exe.run(startup_program)
for iter_idx in range(self.iter_num):
cost_val = self.exe.run(
feed={
"src": self.data["src"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size,
:,
],
"src_sequence_length": self.data["src_sequence_length"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size
],
"trg": self.data["trg"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size,
:,
],
"trg_sequence_length": self.data["trg_sequence_length"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size
],
"label": self.data["label"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size
],
},
fetch_list=[cost],
)[0]
print("iter_idx: %d, cost: %f" % (iter_idx, cost_val))
if __name__ == '__main__':
unittest.main()
@@ -19,12 +19,10 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
import paddle.nn as nn
from paddle import Model, set_device
from paddle.fluid.dygraph import Layer
from paddle.fluid.executor import Executor
from paddle.fluid.framework import _test_eager_guard
from paddle.nn import BeamSearchDecoder, dynamic_decode
from paddle.static import InputSpec as Input
@@ -32,257 +30,6 @@ from paddle.static import InputSpec as Input
paddle.enable_static()
class EncoderCell(layers.RNNCell):
def __init__(self, num_layers, hidden_size, dropout_prob=0.0):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout_prob = dropout_prob
self.lstm_cells = [
layers.LSTMCell(hidden_size) for i in range(num_layers)
]
def call(self, step_input, states):
new_states = []
for i in range(self.num_layers):
out, new_state = self.lstm_cells[i](step_input, states[i])
step_input = (
layers.dropout(out, self.dropout_prob)
if self.dropout_prob > 0
else out
)
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
class DecoderCell(layers.RNNCell):
def __init__(self, num_layers, hidden_size, dropout_prob=0.0):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout_prob = dropout_prob
self.lstm_cells = [
layers.LSTMCell(hidden_size) for i in range(num_layers)
]
def attention(self, hidden, encoder_output, encoder_padding_mask):
query = layers.fc(
hidden, size=encoder_output.shape[-1], bias_attr=False
)
attn_scores = paddle.matmul(
layers.unsqueeze(query, [1]), encoder_output, transpose_y=True
)
if encoder_padding_mask is not None:
attn_scores = paddle.add(attn_scores, encoder_padding_mask)
attn_scores = paddle.nn.functional.softmax(attn_scores)
attn_out = paddle.squeeze(
paddle.matmul(attn_scores, encoder_output), [1]
)
attn_out = layers.concat([attn_out, hidden], 1)
attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False)
return attn_out
def call(
self, step_input, states, encoder_output, encoder_padding_mask=None
):
lstm_states, input_feed = states
new_lstm_states = []
step_input = layers.concat([step_input, input_feed], 1)
for i in range(self.num_layers):
out, new_lstm_state = self.lstm_cells[i](step_input, lstm_states[i])
step_input = (
layers.dropout(out, self.dropout_prob)
if self.dropout_prob > 0
else out
)
new_lstm_states.append(new_lstm_state)
out = self.attention(step_input, encoder_output, encoder_padding_mask)
return out, [new_lstm_states, out]
class Encoder:
def __init__(self, num_layers, hidden_size, dropout_prob=0.0):
self.encoder_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
def __call__(self, src_emb, src_sequence_length):
encoder_output, encoder_final_state = layers.rnn(
cell=self.encoder_cell,
inputs=src_emb,
sequence_length=src_sequence_length,
is_reverse=False,
)
return encoder_output, encoder_final_state
class Decoder:
def __init__(
self,
num_layers,
hidden_size,
dropout_prob,
decoding_strategy="infer_sample",
max_decoding_length=20,
):
self.decoder_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
self.decoding_strategy = decoding_strategy
self.max_decoding_length = (
None
if (self.decoding_strategy == "train_greedy")
else max_decoding_length
)
def __call__(
self,
decoder_initial_states,
encoder_output,
encoder_padding_mask,
**kwargs
):
output_layer = kwargs.pop("output_layer", None)
beam_size = kwargs.get("beam_size", 4)
encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size
)
encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size
)
decoder = BeamSearchDecoder(
cell=self.decoder_cell, output_fn=output_layer, **kwargs
)
(
decoder_output,
decoder_final_state,
dec_seq_lengths,
) = layers.dynamic_decode(
decoder,
inits=decoder_initial_states,
max_step_num=self.max_decoding_length,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask,
impute_finished=False # for test coverage
if self.decoding_strategy == "beam_search"
else True,
is_test=True if self.decoding_strategy == "beam_search" else False,
return_length=True,
)
return decoder_output, decoder_final_state, dec_seq_lengths
class Seq2SeqModel:
"""Seq2Seq model: RNN encoder-decoder with attention"""
def __init__(
self,
num_layers,
hidden_size,
dropout_prob,
src_vocab_size,
trg_vocab_size,
start_token,
end_token,
decoding_strategy="infer_sample",
max_decoding_length=20,
beam_size=4,
):
self.start_token, self.end_token = start_token, end_token
self.max_decoding_length, self.beam_size = (
max_decoding_length,
beam_size,
)
self.src_embeder = paddle.nn.Embedding(
src_vocab_size,
hidden_size,
weight_attr=fluid.ParamAttr(name="source_embedding"),
)
self.trg_embeder = paddle.nn.Embedding(
trg_vocab_size,
hidden_size,
weight_attr=fluid.ParamAttr(name="target_embedding"),
)
self.encoder = Encoder(num_layers, hidden_size, dropout_prob)
self.decoder = Decoder(
num_layers,
hidden_size,
dropout_prob,
decoding_strategy,
max_decoding_length,
)
self.output_layer = lambda x: layers.fc(
x,
size=trg_vocab_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(),
bias_attr=False,
)
def __call__(self, src, src_length, trg=None, trg_length=None):
# encoder
encoder_output, encoder_final_state = self.encoder(
self.src_embeder(src), src_length
)
decoder_initial_states = [
encoder_final_state,
self.decoder.decoder_cell.get_initial_states(
batch_ref=encoder_output, shape=[encoder_output.shape[-1]]
),
]
src_mask = layers.sequence_mask(
src_length, maxlen=paddle.shape(src)[1], dtype="float32"
)
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
# decoder
decoder_kwargs = (
{
"inputs": self.trg_embeder(trg),
"sequence_length": trg_length,
}
if self.decoder.decoding_strategy == "train_greedy"
else (
{
"embedding_fn": self.trg_embeder,
"beam_size": self.beam_size,
"start_token": self.start_token,
"end_token": self.end_token,
}
if self.decoder.decoding_strategy == "beam_search"
else {
"embedding_fn": self.trg_embeder,
"start_tokens": layers.fill_constant_batch_size_like(
input=encoder_output,
shape=[-1],
dtype=src.dtype,
value=self.start_token,
),
"end_token": self.end_token,
}
)
)
decoder_kwargs["output_layer"] = self.output_layer
(decoder_output, decoder_final_state, dec_seq_lengths) = self.decoder(
decoder_initial_states,
encoder_output,
encoder_padding_mask,
**decoder_kwargs
)
if self.decoder.decoding_strategy == "beam_search": # for inference
return decoder_output
logits, samples, sample_length = (
decoder_output.cell_outputs,
decoder_output.sample_ids,
dec_seq_lengths,
)
probs = paddle.nn.functional.softmax(logits)
return probs, samples, sample_length
class PolicyGradient:
"""policy gradient"""
@@ -477,91 +224,6 @@ class SeqPGAgent:
return results
class TestDynamicDecode(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.model_hparams = {
"num_layers": 2,
"hidden_size": 32,
"dropout_prob": 0.1,
"src_vocab_size": 100,
"trg_vocab_size": 100,
"start_token": 0,
"end_token": 1,
"decoding_strategy": "infer_greedy",
"max_decoding_length": 10,
}
self.iter_num = iter_num = 2
self.batch_size = batch_size = 4
src_seq_len = 10
trg_seq_len = 12
self.data = {
"src": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, src_seq_len),
).astype("int64"),
"src_sequence_length": np.random.randint(
1, src_seq_len, (iter_num * batch_size,)
).astype("int64"),
"trg": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len),
).astype("int64"),
"trg_sequence_length": np.random.randint(
1, trg_seq_len, (iter_num * batch_size,)
).astype("int64"),
"label": np.random.randint(
2,
self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len, 1),
).astype("int64"),
}
place = (
core.CUDAPlace(0)
if core.is_compiled_with_cuda()
else core.CPUPlace()
)
self.exe = Executor(place)
def test_beam_search_infer(self):
paddle.set_default_dtype("float32")
paddle.enable_static()
self.model_hparams["decoding_strategy"] = "beam_search"
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
source = fluid.data(name="src", shape=[None, None], dtype="int64")
source_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64"
)
model = Seq2SeqModel(**self.model_hparams)
output = model(source, source_length)
self.exe.run(startup_program)
for iter_idx in range(self.iter_num):
trans_ids = self.exe.run(
program=main_program,
feed={
"src": self.data["src"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size,
:,
],
"src_sequence_length": self.data["src_sequence_length"][
iter_idx
* self.batch_size : (iter_idx + 1)
* self.batch_size
],
},
fetch_list=[output],
)[0]
class ModuleApiTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -14,26 +14,389 @@
import math
from collections.abc import Sequence
from functools import reduce
from functools import partial, reduce
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layers import utils
from paddle.fluid.data_feeder import check_type, check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layers import control_flow, sequence_lod, utils
from paddle.fluid.layers.utils import flatten, map_structure
from paddle.framework import core
from paddle.nn import Layer
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.static import default_startup_program, program_guard
from paddle.static import Variable, default_startup_program, program_guard
from .container import LayerList
__all__ = []
def rnn(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
r"""
rnn creates a recurrent neural network specified by RNNCell `cell`,
which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`)
repeatedly until it reaches the maximum length of `inputs`.
Parameters:
cell(RNNCellBase): An instance of `RNNCellBase`.
inputs(Tensor): the input sequences.
If time_major is True, the shape is
`[time_steps, batch_size, input_size]`
else the shape is `[batch_size, time_steps, input_size]`.
initial_states(Tensor|tuple|list, optional): the initial state of the
rnn cell. Tensor or a possibly nested structure of tensors. If not
provided, `cell.get_initial_states` would be called to produce
the initial state. Defaults to None.
sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
or int32. The valid lengths of input sequences. Defaults to None.
If `sequence_length` is not None, the inputs are treated as
padded sequences. In each input sequence, elements whose time step
index is not less than the valid length are treated as paddings.
time_major (bool, optional): Whether the first dimension of the input means the
time steps. Defaults to False.
is_reverse (bool, optional): Indicate whether to calculate in the reverse
order of input sequences. Defaults to False.
**kwargs: Additional keyword arguments to pass to `forward` of the cell.
Returns:
outputs (Tensor|list|tuple): the output sequence. Tensor or nested
structure of Tensors.
If `time_major` is True, the shape of each tensor in outputs is
`[time_steps, batch_size, hidden_size]`, else
`[batch_size, time_steps, hidden_size]`.
final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
tensor[s], representing the final state for RNN. It has the same
structure as the initial states. Each tensor in final states has the same
shape and dtype as the corresponding tensor in initial states.
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
cell = paddle.nn.SimpleRNNCell(16, 32)
inputs = paddle.rand((4, 23, 16))
prev_h = paddle.randn((4, 32))
outputs, final_states = paddle.nn.layer.rnn(cell, inputs, prev_h)
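# a minimal sketch with padded batches; seq_len is an illustrative
# tensor of valid lengths, not part of the original example
seq_len = paddle.to_tensor([23, 20, 18, 23], dtype="int64")
outputs, final_states = paddle.nn.layer.rnn(
cell, inputs, prev_h, sequence_length=seq_len)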
"""
if _non_static_mode():
return _rnn_dynamic_graph(
cell,
inputs,
initial_states,
sequence_length,
time_major,
is_reverse,
**kwargs
)
else:
return _rnn_static_graph(
cell,
inputs,
initial_states,
sequence_length,
time_major,
is_reverse,
**kwargs
)
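# ArrayWrapper makes the per-step output accumulator opaque to
# flatten/map_structure, so a growing list of step outputs can be threaded
# through the loop without being treated as a nested structure itself.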
class ArrayWrapper:
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def __getitem__(self, item):
return self.array.__getitem__(item)
def _maybe_copy(state, new_state, step_mask):
"""update rnn state or just pass the old state through"""
new_state = paddle.tensor.math._multiply_with_axis(
new_state, step_mask, axis=0
) + paddle.tensor.math._multiply_with_axis(state, (1 - step_mask), axis=0)
return new_state
def _transpose_batch_time(x):
perm = [1, 0] + list(range(2, len(x.shape)))
return paddle.transpose(x, perm)
def _rnn_dynamic_graph(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
time_step_index = 0 if time_major else 1
flat_inputs = flatten(inputs)
time_steps = flat_inputs[0].shape[time_step_index]
if initial_states is None:
initial_states = cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
if not time_major:
inputs = map_structure(_transpose_batch_time, inputs)
if sequence_length is not None:
mask = sequence_lod.sequence_mask(
sequence_length, maxlen=time_steps, dtype=inputs.dtype
)
mask = paddle.transpose(mask, [1, 0])
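# mask: [batch_size, time_steps] -> [time_steps, batch_size], so mask[i] selects step i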
if is_reverse:
inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
mask = (
paddle.reverse(mask, axis=[0])
if sequence_length is not None
else None
)
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = cell(step_inputs, states, **kwargs)
if sequence_length is not None:
new_states = map_structure(
partial(_maybe_copy, step_mask=mask[i]), states, new_states
)
states = new_states
outputs = (
map_structure(lambda x: ArrayWrapper(x), step_outputs)
if i == 0
else map_structure(
lambda x, x_array: x_array.append(x), step_outputs, outputs
)
)
final_outputs = map_structure(
lambda x: paddle.stack(x.array, axis=time_step_index), outputs
)
if is_reverse:
final_outputs = map_structure(
lambda x: paddle.reverse(x, axis=time_step_index), final_outputs
)
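# the states from the last iteration are the final states; when sequence_length
# is given, _maybe_copy has already frozen each sequence's state at its last valid step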
final_states = new_states
return final_outputs, final_states
def _rnn_static_graph(
cell,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
is_reverse=False,
**kwargs
):
check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
if isinstance(inputs, (list, tuple)):
for i, input_x in enumerate(inputs):
check_variable_and_dtype(
input_x, 'inputs[' + str(i) + ']', ['float32', 'float64'], 'rnn'
)
check_type(
initial_states,
'initial_states',
(Variable, list, tuple, type(None)),
'rnn',
)
check_type(
sequence_length, 'sequence_length', (Variable, type(None)), 'rnn'
)
def _switch_grad(x, stop=False):
x.stop_gradient = stop
return x
if initial_states is None:
initial_states = cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
initial_states = map_structure(_switch_grad, initial_states)
if not time_major:
inputs = map_structure(_transpose_batch_time, inputs)
if sequence_length:
max_seq_len = paddle.shape(flatten(inputs)[0])[0]
mask = sequence_lod.sequence_mask(
sequence_length,
maxlen=max_seq_len,
dtype=flatten(initial_states)[0].dtype,
)
mask = paddle.transpose(mask, [1, 0])
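# mask becomes [max_seq_len, batch_size], so each StaticRNN step reads one row via step_input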
if is_reverse:
inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
mask = paddle.reverse(mask, axis=[0]) if sequence_length else None
# StaticRNN
rnn = control_flow.StaticRNN()
with rnn.step():
inputs = map_structure(rnn.step_input, inputs)
states = map_structure(rnn.memory, initial_states)
copy_states = map_structure(lambda x: x, states)
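# hand the cell a shallow copy of the state structure (same variables), keeping
# `states` itself intact for the update_memory call below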
outputs, new_states = cell(inputs, copy_states, **kwargs)
utils.assert_same_structure(states, new_states)
if sequence_length:
step_mask = rnn.step_input(mask)
new_states = map_structure(
partial(_maybe_copy, step_mask=step_mask), states, new_states
)
map_structure(rnn.update_memory, states, new_states)
flat_outputs = flatten(outputs)
map_structure(rnn.step_output, outputs)
map_structure(rnn.step_output, new_states)
rnn_out = rnn()
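# the first len(flat_outputs) results are the stacked step outputs; the rest are
# the stacked per-step states, of which only the last step is kept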
final_outputs = rnn_out[: len(flat_outputs)]
final_outputs = utils.pack_sequence_as(outputs, final_outputs)
final_states = map_structure(lambda x: x[-1], rnn_out[len(flat_outputs) :])
final_states = utils.pack_sequence_as(new_states, final_states)
if is_reverse:
final_outputs = map_structure(
lambda x: paddle.reverse(x, axis=[0]), final_outputs
)
if not time_major:
final_outputs = map_structure(_transpose_batch_time, final_outputs)
return (final_outputs, final_states)
def birnn(
cell_fw,
cell_bw,
inputs,
initial_states=None,
sequence_length=None,
time_major=False,
**kwargs
):
r"""
birnn creates a bidirectional recurrent neural network specified by
RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
(for dygraph mode :code:`cell.forward`) repeatedly until it reaches
the maximum length of `inputs`, and then concatenates the outputs of both RNNs
along the last axis.
Parameters:
cell_fw(RNNCellBase): An instance of `RNNCellBase`.
cell_bw(RNNCellBase): An instance of `RNNCellBase`.
inputs(Tensor): the input sequences.
If time_major is True, the shape is
`[time_steps, batch_size, input_size]`
else the shape is `[batch_size, time_steps, input_size]`.
initial_states(tuple, optional): A tuple of initial states of
`cell_fw` and `cell_bw`.
If not provided, `cell.get_initial_states` would be called to
produce initial state for each cell. Defaults to None.
sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
or int32. The valid lengths of input sequences. Defaults to None.
If `sequence_length` is not None, the inputs are treated as
padded sequences. In each input sequence, elements whose time step
index is not less than the valid length are treated as paddings.
time_major (bool, optional): Whether the first dimension of the input means the
time steps. Defaults to False.
**kwargs: Additional keyword arguments to pass to `forward` of each cell.
Returns:
outputs (Tensor): the outputs of the bidirectional RNN. It is the
concatenation of the outputs from the forward RNN and backward
RNN along the last axis.
If time_major is True, the shape is `[time_steps, batch_size, size]`,
else the shape is `[batch_size, time_steps, size]`, where size is
`cell_fw.hidden_size + cell_bw.hidden_size`.
final_states (tuple): A tuple of the final states of the forward
cell and backward cell.
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
cell_fw = paddle.nn.LSTMCell(16, 32)
cell_bw = paddle.nn.LSTMCell(16, 32)
inputs = paddle.rand((4, 23, 16))
hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
initial_states = ((hf, cf), (hb, cb))
outputs, final_states = paddle.nn.layer.birnn(
cell_fw, cell_bw, inputs, initial_states)
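# a minimal sketch with padded batches; seq_len is an illustrative
# tensor of valid lengths, not part of the original example
seq_len = paddle.to_tensor([23, 19, 23, 16], dtype="int64")
outputs, final_states = paddle.nn.layer.birnn(
cell_fw, cell_bw, inputs, initial_states,
sequence_length=seq_len)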
"""
if initial_states is None:
states_fw = cell_fw.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
states_bw = cell_bw.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0
)
else:
states_fw, states_bw = initial_states
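# run the forward pass and the reversed backward pass, then concatenate
# their outputs along the last axis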
outputs_fw, states_fw = rnn(
cell_fw,
inputs,
states_fw,
sequence_length,
time_major=time_major,
**kwargs
)
outputs_bw, states_bw = rnn(
cell_bw,
inputs,
states_bw,
sequence_length,
time_major=time_major,
is_reverse=True,
**kwargs
)
outputs = map_structure(
lambda x, y: paddle.concat([x, y], -1), outputs_fw, outputs_bw
)
final_states = (states_fw, states_bw)
return outputs, final_states
def split_states(states, bidirectional=False, state_components=1):
r"""
Split states of RNN network into possibly nested list or tuple of
......@@ -779,7 +1142,7 @@ class RNN(Layer):
def forward(
self, inputs, initial_states=None, sequence_length=None, **kwargs
):
final_outputs, final_states = paddle.fluid.layers.rnn(
final_outputs, final_states = rnn(
self.cell,
inputs,
initial_states=initial_states,
......@@ -866,7 +1229,7 @@ class BiRNN(Layer):
len(initial_states) == 2
), "length of initial_states should be 2 when it is a list/tuple"
outputs, final_states = paddle.fluid.layers.birnn(
outputs, final_states = birnn(
self.cell_fw,
self.cell_bw,
inputs,
......