diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index 1132ef393d552c878c8db27752c1f3e3280500d0..397c305c23e040955a12ddfc9683dc1daad0ab32 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -43,9 +43,6 @@ from .io import *
 from . import static_runner
 from .static_runner import StaticModelRunner
 
-from . import rnn
-from .rnn import *
-
 from . import amp
 from .amp import *
 
@@ -60,5 +57,4 @@ __all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
 __all__ += io.__all__
-__all__ += rnn.__all__
 __all__ += amp.__all__
diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py
deleted file mode 100644
index d74e0b1bfee70e790fe195839028917fb5d015aa..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/rnn.py
+++ /dev/null
@@ -1,509 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from . import Layer
-from ..layers import (
-    concat,
-    fill_constant,
-    elementwise_mul,
-    split,
-)
-import copy
-import paddle
-
-__all__ = ['LSTMCell', 'GRUCell']
-
-
-class LSTMCell(Layer):
-    r"""
-    LSTMCell implementation using basic operators.
-    There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation.
-    The algorithm can be described as the equations below.
-
-    .. math::
-
-        i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
-
-        f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
-
-        o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
-
-        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
-
-        c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
-
-        h_t &= o_t \\odot tanh(c_t)
-
-    The other LSTMCell version is compatible with the BasicLSTMUnit used in static graph.
-    The algorithm can be described as the equations below.
-
-    .. math::
-
-        i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-
-        f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-
-        o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-
-        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-
-        c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
-
-        h_t &= o_t \\odot tanh(c_t)
-
-    Args:
-        hidden_size (integer): The hidden size used in the Cell.
-        input_size (integer): The input size used in the Cell.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, LSTMCell will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTMCell.
-            If it is set to None or one attribute of ParamAttr, LSTMCell will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-            Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-            Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate. This
-            is not used in default LSTMCell implementation (CUDNN compatiable)
-        use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell
-        dtype(string): data type used in this cell
-
-    Returns:
-        None
-
-    Examples:
-
-        .. code-block:: python
-
-            from paddle import fluid
-            import paddle.fluid.core as core
-            from paddle.fluid.dygraph import LSTMCell
-            import numpy as np
-            batch_size = 64
-            input_size = 128
-            hidden_size = 256
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                batch_size, input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                batch_size, hidden_size)).astype('float64')
-            pre_cell_np = np.random.uniform(-0.1, 0.1, (
-                batch_size, hidden_size)).astype('float64')
-            if core.is_compiled_with_cuda():
-                place = core.CUDAPlace(0)
-            else:
-                place = core.CPUPlace()
-            with fluid.dygraph.guard(place):
-                cudnn_lstm = LSTMCell(hidden_size, input_size)
-                step_input_var = fluid.dygraph.to_variable(step_input_np)
-                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-                pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
-                new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
-
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        input_size,
-        param_attr=None,
-        bias_attr=None,
-        gate_activation=None,
-        activation=None,
-        forget_bias=1.0,
-        use_cudnn_impl=True,
-        dtype='float64',
-    ):
-        super().__init__(dtype)
-
-        self._hidden_size = hidden_size
-        self._input_size = input_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._dtype = dtype
-        self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
-        self._activation = activation or paddle.tanh
-        self._use_cudnn_impl = use_cudnn_impl
-
-        if self._use_cudnn_impl:
-
-            if (
-                self._param_attr is not None
-                and self._param_attr.name is not None
-            ):
-                weight_ih_param_attr = copy.deepcopy(self._param_attr)
-                weight_hh_param_attr = copy.deepcopy(self._param_attr)
-                weight_ih_param_attr.name += "_weight_ih"
-                weight_hh_param_attr.name += "_weight_hh"
-            else:
-                weight_ih_param_attr = self._param_attr
-                weight_hh_param_attr = self._param_attr
-
-            if self._bias_attr is not None and self._bias_attr.name is not None:
-                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
-                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
-                bias_ih_param_attr.name += "_bias_ih"
-                bias_hh_param_attr.name += "_bias_hh"
-            else:
-                bias_ih_param_attr = self._bias_attr
-                bias_hh_param_attr = self._bias_attr
-
-            self._weight_ih = self.create_parameter(
-                attr=weight_ih_param_attr,
-                shape=[4 * self._hidden_size, self._input_size],
-                dtype=self._dtype,
-            )
-
-            self._weight_hh = self.create_parameter(
-                attr=weight_hh_param_attr,
-                shape=[4 * self._hidden_size, self._hidden_size],
-                dtype=self._dtype,
-            )
-
-            self._bias_ih = self.create_parameter(
-                attr=bias_ih_param_attr,
-                shape=[4 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True,
-            )
-            self._bias_hh = self.create_parameter(
-                attr=bias_hh_param_attr,
-                shape=[4 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True,
-            )
-
-        else:
-
-            self._forget_bias = fill_constant(
-                [1], dtype=dtype, value=forget_bias
-            )
-            self._forget_bias.stop_gradient = False
-
-            self._weight = self.create_parameter(
-                attr=self._param_attr,
-                shape=[
-                    self._input_size + self._hidden_size,
-                    4 * self._hidden_size,
-                ],
-                dtype=dtype,
-            )
-
-            self._bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[4 * self._hidden_size],
-                dtype=dtype,
-                is_bias=True,
-            )
-
-    def forward(self, input, pre_hidden, pre_cell):
-
-        if self._use_cudnn_impl:
-            igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = paddle.add(igates, self._bias_ih)
-            hgates = paddle.matmul(
-                pre_hidden, self._weight_hh, transpose_y=True
-            )
-            hgates = paddle.add(hgates, self._bias_hh)
-            chunked_igates = split(igates, num_or_sections=4, dim=1)
-            chunked_hgates = split(hgates, num_or_sections=4, dim=1)
-
-            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
-            ingate = self._gate_activation(ingate)
-
-            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
-            forgetgate = self._gate_activation(forgetgate)
-
-            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
-            cellgate = self._activation(cellgate)
-
-            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
-            outgate = self._gate_activation(outgate)
-
-            new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
-            new_hidden = outgate * self._activation(new_cell)
-
-        else:
-
-            concat_input_hidden = concat([input, pre_hidden], 1)
-            gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
-
-            gate_input = paddle.add(gate_input, self._bias)
-            i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = paddle.add(
-                paddle.multiply(
-                    pre_cell,
-                    self._gate_activation(paddle.add(f, self._forget_bias)),
-                ),
-                paddle.multiply(
-                    paddle.nn.functional.sigmoid(i), paddle.tanh(j)
-                ),
-            )
-            new_hidden = self._activation(new_cell) * self._gate_activation(o)
-
-        return new_hidden, new_cell
-
-
-class GRUCell(Layer):
-    r"""
-    GRU implementation using basic operators.
-    There are two GRUCell version, the default one is compatible with CUDNN GRU implementation.
-    The algorithm can be described as the equations below.
-
-    .. math::
-
-        u_t & = sigmoid(W_{ux} x_{t} + b_ux + W_{uh} h_{t-1} + b_uh)
-
-        r_t & = sigmoid(W_{rx} x_{t} + b_rx + W_{rh} h_{t-1} + b_rh)
-
-        \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + b_cx + r_t \\odot (W_{ch} h_{t-1} + b_ch))
-
-        h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
-
-    The other LSTMCell version is compatible with the BasicGRUUnit used in static graph.
-    The algorithm can be described as the equations below.
-
-    .. math::
-
-        u_t & = sigmoid(W_{ux} x_{t} + W_{uh} h_{t-1} + b_u)
-
-        r_t & = sigmoid(W_{rx} x_{t} + W_{rh} h_{t-1} + b_r)
-
-        \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + W_{ch} \\odot(r_t, h_{t-1}) + b_m)
-
-        h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
-
-    Args:
-        hidden_size (integer): The hidden size used in the Cell.
-        input_size (integer): The input size used in the Cell.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, GRUCell will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRUCell.
-            If it is set to None or one attribute of ParamAttr, GRUCell will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-            Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-            Default: 'fluid.layers.tanh'
-        use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell
-        dtype(string): data type used in this cell
-
-    Returns:
-        None
-
-    Examples:
-
-        .. code-block:: python
-
-            from paddle import fluid
-            import paddle.fluid.core as core
-            from paddle.fluid.dygraph import GRUCell
-            import numpy as np
-            batch_size = 64
-            input_size = 128
-            hidden_size = 256
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                batch_size, input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                batch_size, hidden_size)).astype('float64')
-            if core.is_compiled_with_cuda():
-                place = core.CUDAPlace(0)
-            else:
-                place = core.CPUPlace()
-            with fluid.dygraph.guard(place):
-                cudnn_gru = GRUCell(hidden_size, input_size)
-                step_input_var = fluid.dygraph.to_variable(step_input_np)
-                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        input_size,
-        param_attr=None,
-        bias_attr=None,
-        gate_activation=None,
-        activation=None,
-        use_cudnn_impl=True,
-        dtype='float64',
-    ):
-        super().__init__()
-
-        self._hidden_size = hidden_size
-        self._input_size = input_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._dtype = dtype
-        self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
-        self._activation = activation or paddle.tanh
-        self._use_cudnn_impl = use_cudnn_impl
-
-        if self._use_cudnn_impl:
-
-            if (
-                self._param_attr is not None
-                and self._param_attr.name is not None
-            ):
-                weight_ih_param_attr = copy.deepcopy(self._param_attr)
-                weight_hh_param_attr = copy.deepcopy(self._param_attr)
-                weight_ih_param_attr.name += "_weight_ih"
-                weight_hh_param_attr.name += "_weight_hh"
-            else:
-                weight_ih_param_attr = self._param_attr
-                weight_hh_param_attr = self._param_attr
-
-            if self._bias_attr is not None and self._bias_attr.name is not None:
-                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
-                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
-                bias_ih_param_attr.name += "_bias_ih"
-                bias_hh_param_attr.name += "_bias_hh"
-            else:
-                bias_ih_param_attr = self._bias_attr
-                bias_hh_param_attr = self._bias_attr
-
-            self._weight_ih = self.create_parameter(
-                attr=weight_ih_param_attr,
-                shape=[3 * self._hidden_size, self._input_size],
-                dtype=self._dtype,
-            )
-
-            self._weight_hh = self.create_parameter(
-                attr=weight_hh_param_attr,
-                shape=[3 * self._hidden_size, self._hidden_size],
-                dtype=self._dtype,
-            )
-
-            self._bias_ih = self.create_parameter(
-                attr=bias_ih_param_attr,
-                shape=[3 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True,
-            )
-            self._bias_hh = self.create_parameter(
-                attr=bias_hh_param_attr,
-                shape=[3 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True,
-            )
-
-        else:
-
-            if (
-                self._param_attr is not None
-                and self._param_attr.name is not None
-            ):
-                gate_weight_param_attr = copy.deepcopy(self._param_attr)
-                candidate_weight_param_attr = copy.deepcopy(self._param_attr)
-                gate_weight_param_attr.name += "_gate_weight"
-                candidate_weight_param_attr.name += "_candidate_weight"
-            else:
-                gate_weight_param_attr = self._param_attr
-                candidate_weight_param_attr = self._param_attr
-
-            if self._bias_attr is not None and self._bias_attr.name is not None:
-                gate_bias_param_attr = copy.deepcopy(self._bias_attr)
-                candidate_bias_param_attr = copy.deepcopy(self._bias_attr)
-                gate_bias_param_attr.name += "_gate_bias"
-                candidate_bias_param_attr.name += "_candidate_bias"
-            else:
-                gate_bias_param_attr = self._bias_attr
-                candidate_bias_param_attr = self._bias_attr
-
-            self._gate_weight = self.create_parameter(
-                attr=gate_weight_param_attr,
-                shape=[
-                    self._input_size + self._hidden_size,
-                    2 * self._hidden_size,
-                ],
-                dtype=dtype,
-            )
-
-            self._candidate_weight = self.create_parameter(
-                attr=candidate_weight_param_attr,
-                shape=[self._input_size + self._hidden_size, self._hidden_size],
-                dtype=dtype,
-            )
-
-            self._gate_bias = self.create_parameter(
-                attr=gate_bias_param_attr,
-                shape=[2 * self._hidden_size],
-                dtype=dtype,
-                is_bias=True,
-            )
-            self._candidate_bias = self.create_parameter(
-                attr=candidate_bias_param_attr,
-                shape=[self._hidden_size],
-                dtype=dtype,
-                is_bias=True,
-            )
-
-    def forward(self, input, pre_hidden):
-
-        if self._use_cudnn_impl:
-            igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = paddle.add(igates, self._bias_ih)
-            hgates = paddle.matmul(
-                pre_hidden, self._weight_hh, transpose_y=True
-            )
-            hgates = paddle.add(hgates, self._bias_hh)
-
-            chunked_igates = split(igates, num_or_sections=3, dim=1)
-            chunked_hgates = split(hgates, num_or_sections=3, dim=1)
-
-            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
-            reset_gate = self._gate_activation(reset_gate)
-
-            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
-            input_gate = self._gate_activation(input_gate)
-
-            _temp = reset_gate * chunked_hgates[2]
-            new_gate = paddle.add(chunked_igates[2], _temp)
-            new_gate = self._activation(new_gate)
-
-            new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
-
-        else:
-
-            concat_input_hidden = concat([input, pre_hidden], 1)
-
-            gate_input = paddle.matmul(
-                x=concat_input_hidden, y=self._gate_weight
-            )
-
-            gate_input = paddle.add(gate_input, self._gate_bias)
-            gate_input = self._gate_activation(gate_input)
-            r, u = split(gate_input, num_or_sections=2, dim=1)
-
-            r_hidden = r * pre_hidden
-
-            candidate = paddle.matmul(
-                concat([input, r_hidden], 1), self._candidate_weight
-            )
-            candidate = paddle.add(candidate, self._candidate_bias)
-
-            c = self._activation(candidate)
-            new_hidden = u * pre_hidden + (1 - u) * c
-
-        return new_hidden
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py b/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
deleted file mode 100644
index 6a1ed39157d529bdb52a5d9f6f69107a2d859b7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph import GRUCell
-
-np.random.seed = 123
-
-
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-x))
-
-
-def tanh(x):
-    return 2.0 * sigmoid(2.0 * x) - 1.0
-
-
-def cudnn_step(
-    step_input_np, pre_hidden_np, weight_ih, bias_ih, weight_hh, bias_hh
-):
-    igates = np.matmul(step_input_np, weight_ih.transpose(1, 0))
-    igates += bias_ih
-    hgates = np.matmul(pre_hidden_np, weight_hh.transpose(1, 0))
-    hgates += bias_hh
-
-    chunked_igates = np.split(igates, indices_or_sections=3, axis=1)
-    chunked_hgates = np.split(hgates, indices_or_sections=3, axis=1)
-
-    reset_gate = chunked_igates[0] + chunked_hgates[0]
-    reset_gate = sigmoid(reset_gate)
-
-    input_gate = chunked_igates[1] + chunked_hgates[1]
-    input_gate = sigmoid(input_gate)
-
-    _temp = reset_gate * chunked_hgates[2]
-    new_gate = chunked_igates[2] + _temp
-    new_gate = tanh(new_gate)
-
-    new_hidden = (pre_hidden_np - new_gate) * input_gate + new_gate
-
-    return new_hidden
-
-
-def non_cudnn_step(
-    step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b
-):
-    concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-    gate_input = np.matmul(concat_1, gate_w)
-    gate_input += gate_b
-    gate_input = sigmoid(gate_input)
-    r, u = np.split(gate_input, indices_or_sections=2, axis=1)
-
-    r_hidden = r * pre_hidden
-
-    candidate = np.matmul(np.concatenate([step_in, r_hidden], 1), candidate_w)
-
-    candidate += candidate_b
-    c = tanh(candidate)
-
-    new_hidden = u * pre_hidden + (1 - u) * c
-
-    return new_hidden
-
-
-class TestCudnnGRU(unittest.TestCase):
-    def setUp(self):
-        self.input_size = 100
-        self.hidden_size = 200
-        self.batch_size = 64
-
-    def test_run(self):
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        with fluid.dygraph.guard(place):
-            param_attr = fluid.ParamAttr(name="param_attr")
-            bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_cudnn_gru = GRUCell(
-                self.hidden_size, self.input_size, param_attr, bias_attr
-            )
-            cudnn_gru = GRUCell(self.hidden_size, self.input_size)
-
-            param_list = cudnn_gru.state_dict()
-            named_param_list = named_cudnn_gru.state_dict()
-
-            # process weight and bias
-
-            weight_ih_name = "_weight_ih"
-            bias_ih_name = "_bias_ih"
-            weight_hh_name = "_weight_hh"
-            bias_hh_name = "_bias_hh"
-
-            weight_ih = param_list[weight_ih_name].numpy()
-            weight_ih = np.random.uniform(
-                -0.1, 0.1, size=weight_ih.shape
-            ).astype('float64')
-            param_list[weight_ih_name].set_value(weight_ih)
-            named_param_list[weight_ih_name].set_value(weight_ih)
-
-            bias_ih = param_list[bias_ih_name].numpy()
-            bias_ih = np.random.uniform(-0.1, 0.1, size=bias_ih.shape).astype(
-                'float64'
-            )
-            param_list[bias_ih_name].set_value(bias_ih)
-            named_param_list[bias_ih_name].set_value(bias_ih)
-
-            weight_hh = param_list[weight_hh_name].numpy()
-            weight_hh = np.random.uniform(
-                -0.1, 0.1, size=weight_hh.shape
-            ).astype('float64')
-            param_list[weight_hh_name].set_value(weight_hh)
-            named_param_list[weight_hh_name].set_value(weight_hh)
-
-            bias_hh = param_list[bias_hh_name].numpy()
-            bias_hh = np.random.uniform(-0.1, 0.1, size=bias_hh.shape).astype(
-                'float64'
-            )
-            param_list[bias_hh_name].set_value(bias_hh)
-            named_param_list[bias_hh_name].set_value(bias_hh)
-
-            step_input_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.input_size)
-            ).astype('float64')
-            pre_hidden_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-
-            step_input_var = fluid.dygraph.to_variable(step_input_np)
-            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-            api_out = cudnn_gru(step_input_var, pre_hidden_var)
-            named_api_out = named_cudnn_gru(step_input_var, pre_hidden_var)
-
-        np_out = cudnn_step(
-            step_input_np, pre_hidden_np, weight_ih, bias_ih, weight_hh, bias_hh
-        )
-
-        np.testing.assert_allclose(api_out.numpy(), np_out, rtol=1e-05, atol=0)
-        np.testing.assert_allclose(
-            named_api_out.numpy(), np_out, rtol=1e-05, atol=0
-        )
-
-
-class TestNonCudnnGRU(unittest.TestCase):
-    def setUp(self):
-        self.input_size = 100
-        self.hidden_size = 200
-        self.batch_size = 64
-
-    def test_run(self):
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        with fluid.dygraph.guard(place):
-            param_attr = fluid.ParamAttr(name="param_attr")
-            bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_non_cudnn_gru = GRUCell(
-                self.hidden_size,
-                self.input_size,
-                param_attr,
-                bias_attr,
-                use_cudnn_impl=False,
-            )
-            non_cudnn_gru = GRUCell(
-                self.hidden_size, self.input_size, use_cudnn_impl=False
-            )
-
-            param_list = non_cudnn_gru.state_dict()
-            named_param_list = named_non_cudnn_gru.state_dict()
-
-            # process weight and bias
-
-            gate_w_name = "_gate_weight"
-            gate_b_name = "_gate_bias"
-            candidate_w_name = "_candidate_weight"
-            candidate_b_name = "_candidate_bias"
-
-            gate_w = param_list[gate_w_name].numpy()
-            gate_w = np.random.uniform(-0.1, 0.1, size=gate_w.shape).astype(
-                'float64'
-            )
-            param_list[gate_w_name].set_value(gate_w)
-            named_param_list[gate_w_name].set_value(gate_w)
-
-            gate_b = param_list[gate_b_name].numpy()
-            gate_b = np.random.uniform(-0.1, 0.1, size=gate_b.shape).astype(
-                'float64'
-            )
-            param_list[gate_b_name].set_value(gate_b)
-            named_param_list[gate_b_name].set_value(gate_b)
-
-            candidate_w = param_list[candidate_w_name].numpy()
-            candidate_w = np.random.uniform(
-                -0.1, 0.1, size=candidate_w.shape
-            ).astype('float64')
-            param_list[candidate_w_name].set_value(candidate_w)
-            named_param_list[candidate_w_name].set_value(candidate_w)
-
-            candidate_b = param_list[candidate_b_name].numpy()
-            candidate_b = np.random.uniform(
-                -0.1, 0.1, size=candidate_b.shape
-            ).astype('float64')
-            param_list[candidate_b_name].set_value(candidate_b)
-            named_param_list[candidate_b_name].set_value(candidate_b)
-
-            step_input_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.input_size)
-            ).astype('float64')
-            pre_hidden_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-
-            step_input_var = fluid.dygraph.to_variable(step_input_np)
-            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-            api_out = non_cudnn_gru(step_input_var, pre_hidden_var)
-            named_api_out = named_non_cudnn_gru(step_input_var, pre_hidden_var)
-
-        np_out = non_cudnn_step(
-            step_input_np,
-            pre_hidden_np,
-            gate_w,
-            gate_b,
-            candidate_w,
-            candidate_b,
-        )
-
-        np.testing.assert_allclose(api_out.numpy(), np_out, rtol=1e-05, atol=0)
-        np.testing.assert_allclose(
-            named_api_out.numpy(), np_out, rtol=1e-05, atol=0
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py b/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
deleted file mode 100644
index 913dc958117342fdc9ba331f5396947d477306d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
+++ /dev/null
@@ -1,278 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph import LSTMCell
-
-np.random.seed = 123
-
-
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-x))
-
-
-def tanh(x):
-    return 2.0 * sigmoid(2.0 * x) - 1.0
-
-
-def non_cudnn_step(
-    step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0
-):
-    concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-    gate_input = np.matmul(concat_1, gate_w)
-    gate_input += gate_b
-    i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1)
-
-    new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
-    new_hidden = tanh(new_cell) * sigmoid(o)
-
-    return new_hidden, new_cell
-
-
-def cudnn_step(
-    step_input_np,
-    pre_hidden_np,
-    pre_cell_np,
-    weight_ih,
-    bias_ih,
-    weight_hh,
-    bias_hh,
-):
-
-    igates = np.matmul(step_input_np, weight_ih.transpose(1, 0))
-    igates = igates + bias_ih
-    hgates = np.matmul(pre_hidden_np, weight_hh.transpose(1, 0))
-    hgates = hgates + bias_hh
-
-    chunked_igates = np.split(igates, indices_or_sections=4, axis=1)
-    chunked_hgates = np.split(hgates, indices_or_sections=4, axis=1)
-
-    ingate = chunked_igates[0] + chunked_hgates[0]
-    ingate = sigmoid(ingate)
-
-    forgetgate = chunked_igates[1] + chunked_hgates[1]
-    forgetgate = sigmoid(forgetgate)
-
-    cellgate = chunked_igates[2] + chunked_hgates[2]
-    cellgate = tanh(cellgate)
-
-    outgate = chunked_igates[3] + chunked_hgates[3]
-    outgate = sigmoid(outgate)
-
-    new_cell = (forgetgate * pre_cell_np) + (ingate * cellgate)
-    new_hidden = outgate * tanh(new_cell)
-
-    return new_hidden, new_cell
-
-
-class TestCudnnLSTM(unittest.TestCase):
-    def setUp(self):
-        self.input_size = 100
-        self.hidden_size = 200
-        self.batch_size = 128
-
-    def test_run(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        with fluid.dygraph.guard(place):
-            param_attr = fluid.ParamAttr(name="param_attr")
-            bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_cudnn_lstm = LSTMCell(
-                self.hidden_size, self.input_size, param_attr, bias_attr
-            )
-            cudnn_lstm = LSTMCell(self.hidden_size, self.input_size)
-
-            param_list = cudnn_lstm.state_dict()
-            named_param_list = named_cudnn_lstm.state_dict()
-
-            # process weight and bias
-
-            weight_ih_name = "_weight_ih"
-            bias_ih_name = "_bias_ih"
-            weight_hh_name = "_weight_hh"
-            bias_hh_name = "_bias_hh"
-            weight_ih = param_list[weight_ih_name].numpy()
-            weight_ih = np.random.uniform(
-                -0.1, 0.1, size=weight_ih.shape
-            ).astype('float64')
-            param_list[weight_ih_name].set_value(weight_ih)
-            named_param_list[weight_ih_name].set_value(weight_ih)
-
-            bias_ih = param_list[bias_ih_name].numpy()
-            bias_ih = np.random.uniform(-0.1, 0.1, size=bias_ih.shape).astype(
-                'float64'
-            )
-            param_list[bias_ih_name].set_value(bias_ih)
-            named_param_list[bias_ih_name].set_value(bias_ih)
-
-            weight_hh = param_list[weight_hh_name].numpy()
-            weight_hh = np.random.uniform(
-                -0.1, 0.1, size=weight_hh.shape
-            ).astype('float64')
-            param_list[weight_hh_name].set_value(weight_hh)
-            named_param_list[weight_hh_name].set_value(weight_hh)
-
-            bias_hh = param_list[bias_hh_name].numpy()
-            bias_hh = np.random.uniform(-0.1, 0.1, size=bias_hh.shape).astype(
-                'float64'
-            )
-            param_list[bias_hh_name].set_value(bias_hh)
-            named_param_list[bias_hh_name].set_value(bias_hh)
-
-            step_input_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.input_size)
-            ).astype('float64')
-            pre_hidden_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-            pre_cell_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-
-            step_input_var = fluid.dygraph.to_variable(step_input_np)
-            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-            pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
-            api_out = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
-            named_api_out = named_cudnn_lstm(
-                step_input_var, pre_hidden_var, pre_cell_var
-            )
-
-            api_hidden_out = api_out[0]
-            api_cell_out = api_out[1]
-            named_api_hidden_out = named_api_out[0]
-            named_api_cell_out = named_api_out[1]
-
-            np_hidden_out, np_cell_out = cudnn_step(
-                step_input_np,
-                pre_hidden_np,
-                pre_cell_np,
-                weight_ih,
-                bias_ih,
-                weight_hh,
-                bias_hh,
-            )
-            np.testing.assert_allclose(
-                api_hidden_out.numpy(), np_hidden_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                api_cell_out.numpy(), np_cell_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                named_api_hidden_out.numpy(), np_hidden_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                named_api_cell_out.numpy(), np_cell_out, rtol=1e-05, atol=0
-            )
-
-
-class TestNonCudnnLSTM(unittest.TestCase):
-    def setUp(self):
-        self.input_size = 100
-        self.hidden_size = 200
-        self.batch_size = 128
-
-    def test_run(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        with fluid.dygraph.guard(place):
-            param_attr = fluid.ParamAttr(name="param_attr")
-            bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_cudnn_lstm = LSTMCell(
-                self.hidden_size,
-                self.input_size,
-                param_attr,
-                bias_attr,
-                use_cudnn_impl=False,
-            )
-            cudnn_lstm = LSTMCell(
-                self.hidden_size, self.input_size, use_cudnn_impl=False
-            )
-
-            param_list = cudnn_lstm.state_dict()
-            named_param_list = named_cudnn_lstm.state_dict()
-
-            # process weight and bias
-
-            gate_w_name = "_weight"
-            gate_b_name = "_bias"
-
-            gate_w = param_list[gate_w_name].numpy()
-            gate_w = np.random.uniform(-0.1, 0.1, size=gate_w.shape).astype(
-                'float64'
-            )
-            param_list[gate_w_name].set_value(gate_w)
-            named_param_list[gate_w_name].set_value(gate_w)
-
-            gate_b = param_list[gate_b_name].numpy()
-            gate_b = np.random.uniform(-0.1, 0.1, size=gate_b.shape).astype(
-                'float64'
-            )
-            param_list[gate_b_name].set_value(gate_b)
-            named_param_list[gate_b_name].set_value(gate_b)
-
-            step_input_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.input_size)
-            ).astype('float64')
-            pre_hidden_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-            pre_cell_np = np.random.uniform(
-                -0.1, 0.1, (self.batch_size, self.hidden_size)
-            ).astype('float64')
-
-            step_input_var = fluid.dygraph.to_variable(step_input_np)
-            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
-            pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
-            api_out = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
-            named_api_out = named_cudnn_lstm(
-                step_input_var, pre_hidden_var, pre_cell_var
-            )
-
-            api_hidden_out = api_out[0]
-            api_cell_out = api_out[1]
-            named_api_hidden_out = named_api_out[0]
-            named_api_cell_out = named_api_out[1]
-
-            np_hidden_out, np_cell_out = non_cudnn_step(
-                step_input_np, pre_hidden_np, pre_cell_np, gate_w, gate_b
-            )
-
-            np.testing.assert_allclose(
-                api_hidden_out.numpy(), np_hidden_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                api_cell_out.numpy(), np_cell_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                named_api_hidden_out.numpy(), np_hidden_out, rtol=1e-05, atol=0
-            )
-            np.testing.assert_allclose(
-                named_api_cell_out.numpy(), np_cell_out, rtol=1e-05, atol=0
-            )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index bd372cbabd552635c591081539726d6e07e3e4aa..4e552bebb75000584437a0e49dcf89cf0693a49c 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -1103,7 +1103,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
     'test_trt_reduce_sum_op',
     'data_type_transform_test',
     'test_gru_rnn_op',
-    'test_cudnn_grucell',
     'test_argsort_op',
     'test_batch_norm_op',
     'test_inplace',
@@ -1190,7 +1189,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
     'test_nn_functional_embedding_dygraph',
     'test_reshape_op',
     'test_maxout_op',
-    'test_cudnn_lstmcell',
     'test_sigmoid_focal_loss',
     'test_manual_seed',
     'test_lrn_op',
@@ -2727,7 +2725,6 @@ TWO_PARALLEL_JOB = [
     'test_polygon_box_transform',
     'test_sequence_pad_op',
     'test_sequence_expand',
-    'test_cudnn_grucell',
     'test_pool2d_bf16_mkldnn_op',
     'test_bilinear_api',
     'test_parallel_executor_inference_feed_partial_data',
@@ -2783,7 +2780,6 @@ TWO_PARALLEL_JOB = [
     'test_cos_sim_op',
     'test_sequence_enumerate_op',
     'test_sequence_concat',
-    'test_cudnn_lstmcell',
     'test_data_norm_op',
     'test_decoupled_py_reader_data_check',
     'test_deformable_conv_v1_op',
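For reference, the default ("cuDNN-compatible") path of the removed LSTMCell reduces to the single step below. This is a minimal NumPy sketch of the docstring equations, mirroring the `cudnn_step` helper in the deleted test; the function and argument names are illustrative only, not part of any Paddle API.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, w_ih, b_ih, w_hh, b_hh):
    # Gate pre-activations; weights are laid out as [4 * hidden, input] and
    # [4 * hidden, hidden], matching the removed cell's parameter shapes.
    gates = x @ w_ih.T + b_ih + h_prev @ w_hh.T + b_hh
    i, f, g, o = np.split(gates, 4, axis=1)  # input, forget, cell, output
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(g)
    h = sigmoid(o) * np.tanh(c)
    return h, c
```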
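The removed GRUCell's default path follows the same pattern with three gate chunks. A NumPy sketch of that step (again illustrative names, not an API); note that the `(h_prev - n) * u + n` form used in the deleted `forward` is algebraically identical to the docstring's `u * h_prev + (1 - u) * n`:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h_prev, w_ih, b_ih, w_hh, b_hh):
    # Each weight holds three stacked chunks: reset, update, candidate.
    ig = x @ w_ih.T + b_ih
    hg = h_prev @ w_hh.T + b_hh
    ir, iu, ic = np.split(ig, 3, axis=1)
    hr, hu, hc = np.split(hg, 3, axis=1)
    r = sigmoid(ir + hr)        # reset gate
    u = sigmoid(iu + hu)        # update gate
    n = np.tanh(ic + r * hc)    # candidate state
    # Same as u * h_prev + (1 - u) * n.
    return (h_prev - n) * u + n
```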
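This change leaves no cell classes under `paddle.fluid.dygraph`; the maintained equivalents live in `paddle.nn`. A migration sketch, assuming the Paddle 2.x `paddle.nn.LSTMCell`/`paddle.nn.GRUCell` API (an assumption, not part of this diff; note the constructor order is `(input_size, hidden_size)`, the reverse of the removed cells, and `forward` also returns the step output):

```python
import paddle
import paddle.nn as nn

batch_size, input_size, hidden_size = 64, 128, 256

x = paddle.randn([batch_size, input_size])
h = paddle.randn([batch_size, hidden_size])
c = paddle.randn([batch_size, hidden_size])

# Assumed paddle.nn API: cells take (input_size, hidden_size), the reverse
# of the removed fluid cells, which took (hidden_size, input_size).
lstm_cell = nn.LSTMCell(input_size, hidden_size)
step_out, (new_h, new_c) = lstm_cell(x, (h, c))

gru_cell = nn.GRUCell(input_size, hidden_size)
step_out, new_h = gru_cell(x, h)
```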