From 3ba1237e5f46567faa6853485a0776f49865ae35 Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Tue, 6 Dec 2022 09:57:04 +0800 Subject: [PATCH] [remove fluid] GRUUnit NCE (#48610) * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE --- python/paddle/fluid/dygraph/nn.py | 480 ------------------ .../unittests/dygraph_to_static/test_lac.py | 10 +- .../fluid/tests/unittests/test_gru_unit_op.py | 16 - .../test_imperative_load_static_param.py | 5 +- .../test_imperative_ocr_attention_model.py | 18 +- .../fluid/tests/unittests/test_layers.py | 471 ----------------- .../paddle/fluid/tests/unittests/test_nce.py | 53 -- 7 files changed, 10 insertions(+), 1043 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index abef927af86..39da342c380 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -54,8 +54,6 @@ __all__ = [ 'Linear', 'BatchNorm', 'Embedding', - 'GRUUnit', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', @@ -1363,484 +1361,6 @@ class Embedding(layers.Layer): return out -class GRUUnit(layers.Layer): - """ - **GRU unit layer** - - It creates a callable object from GRUUnit class. - If origin_mode is True, then the equation of a gru step is from paper - `Learning Phrase Representations using RNN Encoder-Decoder for Statistical - Machine Translation `_ - - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) - - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) - - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - - h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) - - If origin_mode is False, then the equation of a gru step is from paper - `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence - Modeling `_ - - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) - - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) - - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - - h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) - - - The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms - of the equation above, the :math:`z_t` is split into 3 parts - - :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to - implement a full GRU unit operator for an input, a fully - connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. - - The terms :math:`u_t` and :math:`r_t` represent the update and reset gates - of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is - an intermediate candidate hidden output, which is denoted by :math:`m_t`. - This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` - and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. - - Parameters: - size (int): The input dimension value. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - hidden-hidden weight matrix. - - **Note**: - - 1. The shape of the weight matrix is :math:`[T, 3*D]`, where D is the hidden size. - 2. All elements in the weight matrix can be divided into two parts. The first - part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`, - and the second part are weights for candidate hidden state with shape :math:`[D, D]`. - - - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. The default - value is None. 
- bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias - of GRU.Note that the bias with :math:`[1, 3*D]` concatenates - the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, gru_unit will create ParamAttr as - bias_attr. If the Initializer of the bias_attr is not set, the bias - is initialized zero. The default value is None. - activation (str): The activation type for cell (actNode). - The default value is 'tanh'. - gate_activation (str): The activation type for gates (actGate). - The default value is 'sigmoid'. - dtype(str): The dtype of the layers. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter): the learnable bias of this layer. - - Returns: - tuple: The hidden value, reset-hidden value and gate values. The hidden value - is a 2-D tensor with shape :math:`[T, D]` . The reset-hidden value is a - 2-D tensor with shape :math:`[T, D]` . The gate value is a 2-D tensor with - shape :math:`[T, 3*D]`. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.dygraph.base as base - import numpy - - lod = [[2, 4, 3]] - D = 5 - T = sum(lod[0]) - - input = numpy.random.rand(T, 3 * D).astype('float32') - hidden_input = numpy.random.rand(T, D).astype('float32') - with fluid.dygraph.guard(): - x = numpy.random.random((3, 32, 32)).astype('float32') - gru = fluid.dygraph.GRUUnit(size=D * 3) - dy_ret = gru( - base.to_variable(input), base.to_variable(hidden_input)) - - """ - - def __init__( - self, - size, - param_attr=None, - bias_attr=None, - activation='tanh', - gate_activation='sigmoid', - origin_mode=False, - dtype='float32', - ): - super().__init__() - self._bias_attr = bias_attr - activation_dict = dict( - identity=0, - sigmoid=1, - tanh=2, - relu=3, - ) - self.activation = activation_dict[activation] - self.gate_activation = activation_dict[gate_activation] - - self._dtype = dtype - size = size // 3 - # create weight - self.weight = self.create_parameter( - attr=param_attr, shape=[size, 3 * size], dtype=dtype - ) - - # create bias - bias_size = [1, 3 * size] - self._bias_size = bias_size - self.bias = self.create_parameter( - attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True - ) - - def forward(self, input, hidden): - if _non_static_mode(): - gate, reset_hidden_pre, updated_hidden = _legacy_C_ops.gru_unit( - input, - hidden, - self.weight, - self.bias, - 'activation', - self.activation, - 'gate_activation', - self.gate_activation, - ) - return updated_hidden, reset_hidden_pre, gate - - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'GRUUnit' - ) - check_variable_and_dtype( - hidden, 'hidden', ['float32', 'float64'], 'GRUUnit' - ) - inputs = { - 'Input': [input], - 'HiddenPrev': [hidden], - 'Weight': [self.weight], - } - if self.bias is not None: - inputs['Bias'] = [self.bias] - gate = self._helper.create_variable_for_type_inference(self._dtype) - reset_hidden_pre = self._helper.create_variable_for_type_inference( - self._dtype - ) - updated_hidden = self._helper.create_variable_for_type_inference( - self._dtype - ) - self._helper.append_op( - type='gru_unit', - inputs=inputs, - outputs={ - 'Gate': gate, - 'ResetHiddenPrev': reset_hidden_pre, - 'Hidden': updated_hidden, - }, - attrs={ - 
'activation': self.activation, - 'gate_activation': self.gate_activation, - }, - ) - - return updated_hidden, reset_hidden_pre, gate - - -class NCE(layers.Layer): - """ - This interface is used to construct a callable object of the ``NCE`` class. - For more details, refer to code examples. - It implements the function of the ``NCE`` loss function. - By default this function uses a uniform distribution for sampling, and it - compute and return the noise-contrastive estimation training loss. See - `Noise-contrastive estimation: A new estimation principle for unnormalized statistical models `_ . - - Parameters: - num_total_classes (int): Total number of classes in all samples. - dim (int): Dimension of input (possibly embedding dim). - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) - of nce. If it is set to None or one attribute of ParamAttr, nce - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr or bool, optional): The attribute for the bias of nce. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, nce - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - num_neg_samples (int, optional): The number of negative classes. The default value is 10. - sampler (str, optional): The sampler used to sample class from negative classes. - It can be 'uniform', 'log_uniform' or 'custom_dist'. - default: 'uniform'. - custom_dist (float[], optional): A float[] with size=num_total_classes. - It is used when sampler is set to 'custom_dist'. - custom_dist[i] is the probability of i-th class to be sampled. - Default: None. - seed (int, optional): The seed used in sampler. Default: 0. - is_sparse(bool, optional): The flag indicating whether to use sparse update. If is_sparse is True, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Examples: - .. 
code-block:: python - - import numpy as np - import paddle.fluid as fluid - - window_size = 5 - dict_size = 20 - label_word = int(window_size // 2) + 1 - inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64') - nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') - - with fluid.dygraph.guard(): - words = [] - for i in range(window_size): - words.append(fluid.dygraph.base.to_variable(inp_word[i])) - - emb = fluid.Embedding( - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = fluid.layers.concat(input=embs3, axis=1) - nce = fluid.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=1, - param_attr='nce.w', - bias_attr='nce.b') - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss3 = nce(embs3, wl) - - """ - - def __init__( - self, - num_total_classes, - dim, - sample_weight=None, - param_attr=None, - bias_attr=None, - num_neg_samples=None, - sampler="uniform", - custom_dist=None, - seed=0, - is_sparse=False, - dtype='float32', - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._num_total_classes = num_total_classes - self._dtype = dtype - self._inputs = dict() - self._inputs['SampleWeight'] = ( - sample_weight if sample_weight is not None else [] - ) - if sampler == "uniform": - sampler = 0 - elif sampler == "log_uniform": - sampler = 1 - elif sampler == "custom_dist": - assert custom_dist is not None - # assert isinstance(custom_dist, Variable) - - custom_dist_len = len(custom_dist) - alias_probs_ = [0] * custom_dist_len - alias_ = [0] * custom_dist_len - bigs = [] - littles = [] - for i in range(custom_dist_len): - normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 0: - bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 0: - littles.append((i, normal_prob)) - else: - alias_probs_[i] = normal_prob - alias_[i] = -1 - - while len(bigs) and len(littles): - big = bigs.pop(0) - little = littles.pop(0) - - big_idx = big[0] - big_prob = big[1] - - alias_probs_[little[0]] = little[1] - alias_[little[0]] = big_idx - big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 0: - bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 0: - littles.append((big_idx, big_left)) - else: - alias_probs_[big_idx] = big_left - alias_[big_idx] = -1 - - if len(bigs): - big = bigs.pop(0) - alias_probs_[big[0]] = 1.0 - alias_[big[0]] = -1 - if len(littles): - little = littles.pop(0) - alias_probs_[little[0]] = 1.0 - alias_[little[0]] = -1 - - def _init_by_numpy_array(numpy_array): - ret = self.create_parameter( - attr=ParamAttr(), - shape=numpy_array.shape, - dtype=numpy_array.dtype, - default_initializer=NumpyArrayInitializer(numpy_array), - ) - ret.stop_gradient = True - return ret - - self._inputs['CustomDistProbs'] = _init_by_numpy_array( - np.array(custom_dist).astype('float32') - ) - self._inputs['CustomDistAlias'] = _init_by_numpy_array( - np.array(alias_).astype('int32') - ) - self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array( - np.array(alias_probs_).astype('float32') - ) - sampler = 2 - else: - raise Exception("Unsupported sampler type.") - - if num_neg_samples is None: - num_neg_samples = 10 - else: - num_neg_samples = int(num_neg_samples) - self._num_neg_samples = num_neg_samples - remote_prefetch = is_sparse - print( - "With 
sparse mode, if your models has only small parameter prefetch may cause speed down" - ) - self._attrs = { - 'num_total_classes': int(num_total_classes), - 'num_neg_samples': num_neg_samples, - 'seed': seed, - 'sampler': sampler, - 'is_sparse': is_sparse, - 'remote_prefetch': remote_prefetch, - } - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=[self._num_total_classes, dim], - is_bias=False, - dtype=self._dtype, - ) - if self._bias_attr: - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_total_classes, 1], - is_bias=True, - dtype=self._dtype, - ) - self._inputs['Bias'] = self.bias - self._inputs['Weight'] = self.weight - - def forward(self, input, label, sample_weight=None): - if _non_static_mode(): - attrs = ( - 'num_total_classes', - self._attrs['num_total_classes'], - 'num_neg_samples', - self._attrs['num_neg_samples'], - 'seed', - self._attrs['seed'], - 'sampler', - self._attrs['sampler'], - 'is_sparse', - self._attrs['is_sparse'], - 'remote_prefetch', - self._attrs['remote_prefetch'], - ) - cost, _, _ = _legacy_C_ops.nce( - input, - label, - self.weight, - self.bias, - self._inputs['SampleWeight'], - self._inputs['CustomDistProbs'], - self._inputs['CustomDistAlias'], - self._inputs['CustomDistAliasProbs'], - *attrs - ) - return cost / (self._num_neg_samples + 1) - - check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE") - check_variable_and_dtype(label, "label", ['int64'], "NCE") - check_type( - sample_weight, 'sample_weight', (Variable, type(None)), 'NCE' - ) - assert isinstance(input, Variable) - assert isinstance(label, Variable) - - self._inputs['Input'] = input - self._inputs['Label'] = label - self._inputs['SampleWeight'] = ( - sample_weight if sample_weight is not None else [] - ) - - cost = self._helper.create_variable_for_type_inference( - dtype=input.dtype - ) - sample_logits = self._helper.create_variable_for_type_inference( - dtype=input.dtype - ) - sample_labels = self._helper.create_variable_for_type_inference( - dtype=label.dtype - ) - - self._helper.append_op( - type='nce', - inputs=self._inputs, - outputs={ - 'Cost': cost, - 'SampleLogits': sample_logits, - 'SampleLabels': sample_labels, - }, - attrs=self._attrs, - ) - return cost / (self._num_neg_samples + 1) - - class PRelu(layers.Layer): r""" This interface is used to construct a callable object of the ``PRelu`` class. 
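For sampler="custom_dist", the NCE constructor removed above precomputes Walker's alias-method tables from the user-supplied class distribution and feeds them to the nce op as CustomDistProbs, CustomDistAlias and CustomDistAliasProbs. A condensed, standalone sketch of that table construction follows (the function name is illustrative, and residual entries are flushed in one loop rather than one pop per list as in the original code):

.. code-block:: python

    def build_alias_tables(custom_dist):
        """Build alias-sampling tables: class i is kept with probability
        alias_probs[i] (after rescaling by len(custom_dist)); otherwise the
        sample falls through to class alias[i]."""
        n = len(custom_dist)
        alias_probs = [0.0] * n
        alias = [0] * n
        bigs, littles = [], []
        for i, p in enumerate(custom_dist):
            scaled = p * n                      # rescale so the mean bucket mass is 1
            if scaled > 1.0:
                bigs.append((i, scaled))
            elif scaled < 1.0:
                littles.append((i, scaled))
            else:                               # exactly full bucket, no alias needed
                alias_probs[i] = scaled
                alias[i] = -1
        while bigs and littles:
            big_idx, big_p = bigs.pop(0)
            little_idx, little_p = littles.pop(0)
            alias_probs[little_idx] = little_p  # keep little_idx with prob little_p,
            alias[little_idx] = big_idx         # otherwise redirect to big_idx
            big_left = big_p + little_p - 1.0   # mass still owed by the big entry
            if big_left > 1.0:
                bigs.append((big_idx, big_left))
            elif big_left < 1.0:
                littles.append((big_idx, big_left))
            else:
                alias_probs[big_idx] = big_left
                alias[big_idx] = -1
        for idx, _ in bigs + littles:           # leftovers are (numerically) full buckets
            alias_probs[idx] = 1.0
            alias[idx] = -1
        return alias_probs, alias

    # e.g. build_alias_tables([0.5, 0.25, 0.125, 0.125])
    # -> alias_probs = [1.0, 1.0, 0.5, 0.5], alias = [-1, -1, 0, 0]
    #    (classes 2 and 3 alias to class 0)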
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 5aff8c710ae..dc1eedcc8d2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -25,7 +25,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "2" import paddle import paddle.fluid as fluid from paddle import _legacy_C_ops -from paddle.fluid.dygraph import Embedding, GRUUnit, to_variable +from paddle.fluid.dygraph import Embedding, to_variable from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.framework import _non_static_mode from paddle.jit import ProgramTranslator @@ -57,13 +57,9 @@ class DynamicGRU(fluid.dygraph.Layer): ): super().__init__() - self.gru_unit = GRUUnit( + self.gru_unit = paddle.nn.GRUCell( size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode, + size, ) self.size = size diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index 92fadf591a7..edb8c83ced1 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -23,22 +23,6 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.layers import gru_unit -class TestGRUUnitAPIError(unittest.TestCase): - def test_errors(self): - with fluid.program_guard(fluid.Program(), fluid.Program()): - D = 5 - layer = fluid.dygraph.nn.GRUUnit(size=D * 3) - # the input must be Variable. - x0 = fluid.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() - ) - self.assertRaises(TypeError, layer, x0) - # the input dtype must be float32 or float64 - x = fluid.data(name='x', shape=[-1, D * 3], dtype='float16') - hidden = fluid.data(name='hidden', shape=[-1, D], dtype='float32') - self.assertRaises(TypeError, layer, x, hidden) - - class GRUActivationType(OpTest): identity = 0 sigmoid = 1 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index a98d9b994b3..573c1699acd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import numpy as np import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import NCE, BatchNorm, Embedding, GroupNorm, PRelu +from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm, PRelu from paddle.nn import Linear @@ -212,9 +212,6 @@ class TestDygraphLoadStatic(unittest.TestCase): self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) - self.nce1 = NCE(10000, 100) - self.nce2 = NCE(10000, 100) - self.prelu1 = PRelu("channel", channel=5) self.prelu2 = PRelu("channel", channel=5) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 998e9de7ff8..4e4c8aa4351 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid 
import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GRUUnit +from paddle.fluid.dygraph.nn import BatchNorm, Embedding from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear @@ -168,13 +168,9 @@ class DynamicGRU(fluid.dygraph.Layer): ): super().__init__() - self.gru_unit = GRUUnit( + self.gru_unit = paddle.nn.GRUCell( size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode, + size, ) self.h_0 = h_0 @@ -189,7 +185,7 @@ class DynamicGRU(fluid.dygraph.Layer): i = inputs.shape[1] - 1 - i input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1]) input_ = paddle.reshape(input_, [-1, input_.shape[2]]) - hidden, reset, gate = self.gru_unit(input_, hidden) + hidden, reset = self.gru_unit(input_, hidden) hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]]) if self.is_reverse: res = [hidden_] + res @@ -330,9 +326,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): self.fc_2_layer = Linear( decoder_size, decoder_size * 3, bias_attr=False ) - self.gru_unit = GRUUnit( - size=decoder_size * 3, param_attr=None, bias_attr=None - ) + self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size) self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None) self.decoder_size = decoder_size @@ -357,7 +351,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): fc_2 = self.fc_2_layer(current_word) decoder_inputs = paddle.add(x=fc_1, y=fc_2) - h, _, _ = self.gru_unit(decoder_inputs, hidden_mem) + h, _ = self.gru_unit(decoder_inputs, hidden_mem) hidden_mem = h out = self.out_layer(h) out = paddle.nn.functional.softmax(out) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index dcf442200d1..dcf9d4d1000 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -316,147 +316,6 @@ class TestLayer(LayerTest): np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05) np.testing.assert_allclose(static_ret, dy_eager_ret_value, rtol=1e-05) - def test_gru_unit(self): - lod = [[2, 4, 3]] - D = 5 - T = sum(lod[0]) - N = len(lod[0]) - - input = np.random.rand(T, 3 * D).astype('float32') - hidden_input = np.random.rand(T, D).astype('float32') - - with self.static_graph(): - x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') - hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') - updated_hidden, reset_hidden_pre, gate = layers.gru_unit( - input=x, hidden=hidden, size=D * 3 - ) - static_ret = self.get_static_graph_result( - feed={'x': input, 'hidden': hidden_input}, - fetch_list=[updated_hidden, reset_hidden_pre, gate], - ) - - with self.static_graph(): - x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') - hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') - updated_hidden, reset_hidden_pre, gate = layers.gru_unit( - input=x, hidden=hidden, size=D * 3 - ) - gru = nn.GRUUnit(size=D * 3) - updated_hidden, reset_hidden_pre, gate = gru(x, hidden) - - static_ret2 = self.get_static_graph_result( - feed={'x': input, 'hidden': hidden_input}, - fetch_list=[updated_hidden, reset_hidden_pre, gate], - ) - - with self.dynamic_graph(): - with _test_eager_guard(): - gru = nn.GRUUnit(size=D * 3) - dy_eager_ret = gru( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_eager_ret_value = [] - for i in 
range(len(static_ret)): - dy_eager_ret_value.append(dy_eager_ret[i].numpy()) - - gru = nn.GRUUnit(size=D * 3) - dy_ret = gru( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret_value = [] - for i in range(len(static_ret)): - dy_ret_value.append(dy_ret[i].numpy()) - - for i in range(len(static_ret)): - np.testing.assert_allclose( - static_ret[i], static_ret2[i], rtol=1e-05 - ) - np.testing.assert_allclose( - static_ret[i], dy_ret_value[i], rtol=1e-05 - ) - np.testing.assert_allclose( - static_ret[i], dy_eager_ret_value[i], rtol=1e-05 - ) - - with self.dynamic_graph(): - with _test_eager_guard(): - custom_weight = np.random.randn(D, D * 3).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - gru1 = nn.GRUUnit(size=D * 3) - gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - self.assertFalse( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) - gru2.weight.set_value(gru1.weight.numpy()) - gru2.bias.set_value(gru1.bias) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - np.testing.assert_array_equal(o1.numpy(), o2.numpy()) - - gru2.weight = gru1.weight - gru2.bias = gru1.bias - np.testing.assert_array_equal( - gru1.weight.numpy(), gru2.weight.numpy() - ) - np.testing.assert_array_equal( - gru1.bias.numpy(), gru2.bias.numpy() - ) - - custom_weight = np.random.randn(D, D * 3).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - gru1 = nn.GRUUnit(size=D * 3) - gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - self.assertFalse( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) - gru2.weight.set_value(gru1.weight.numpy()) - gru2.bias.set_value(gru1.bias) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - np.testing.assert_array_equal(o1.numpy(), o2.numpy()) - - gru2.weight = gru1.weight - gru2.bias = gru1.bias - np.testing.assert_array_equal( - gru1.weight.numpy(), gru2.weight.numpy() - ) - np.testing.assert_array_equal(gru1.bias.numpy(), gru2.bias.numpy()) - def test_elementwise_math(self): n = np.ones([3, 3], dtype='float32') n2 = np.ones([3, 3], dtype='float32') * 1.1 @@ -1139,336 +998,6 @@ class TestLayer(LayerTest): emb1.weight.numpy(), emb2.weight.numpy() ) - def test_nce(self): - window_size = 5 - dict_size = 20 - label_word = int(window_size // 2) + 1 - inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64') - nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') - seed = 1 - with self.static_graph(): - words = [] - for i in range(window_size): - words.append( - layers.data( - name='word_{0}'.format(i), shape=[None], dtype='int64' - ) - ) - sample_weights 
= layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - embs = [] - for i in range(window_size): - if i == label_word: - continue - - emb = fluid.embedding( - input=words[i], - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False, - ) - embs.append(emb) - - embs = layers.concat(input=embs, axis=1) - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss = paddle.static.nn.nce( - input=embs, - label=wl, - num_total_classes=dict_size, - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - sample_weight=sample_weights, - ) - feed_dict = dict() - for i in range(window_size): - feed_dict['word_{0}'.format(i)] = inp_word[i] - static_rlt = self.get_static_graph_result( - feed=feed_dict, fetch_list=[nce_loss] - )[0] - - with self.static_graph(): - words = [] - for i in range(window_size): - words.append( - layers.data( - name='word_{0}'.format(i), shape=[None], dtype='int64' - ) - ) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs2 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs2.append(emb_rlt) - - embs2 = layers.concat(input=embs2, axis=1) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs2.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss2 = nce(embs2, wl) - feed_dict = dict() - for i in range(len(words)): - feed_dict['word_{0}'.format(i)] = inp_word[i] - - static_rlt2 = self.get_static_graph_result( - feed=feed_dict, fetch_list=[nce_loss2] - )[0] - - with self.dynamic_graph(): - with _test_eager_guard(): - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat( - input=embs3, axis=fluid.dygraph.to_variable(np.array([1])) - ) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='eager_nce.w', - bias_attr='eager_nce.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - dy_eager_rlt = nce(embs3, wl) - dy_eager_rlt_value = dy_eager_rlt.numpy() - - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat( - input=embs3, axis=fluid.dygraph.to_variable(np.array([1])) - ) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - 
sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - dy_rlt = nce(embs3, wl) - dy_rlt_value = dy_rlt.numpy() - - np.testing.assert_allclose(static_rlt2, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_rlt_value, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_eager_rlt_value, static_rlt, rtol=1e-05) - - with self.dynamic_graph(): - with _test_eager_guard(): - custom_weight = np.random.randn(dict_size, 128).astype( - "float32" - ) - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=fluid.dygraph.to_variable(np.array([5, 1])), - dtype='float32', - value=1, - ) - emb = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat(input=embs3, axis=1) - nce1 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='eager_nce1.w', - bias_attr='eager_nce1.b', - sample_weight=sample_weights, - ) - - nce2 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr=weight_attr, - bias_attr='eager_nce2.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - self.assertFalse( - np.array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - ) - nce2.weight.set_value(nce1.weight.numpy()) - nce2.bias.set_value(nce1.bias) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - np.testing.assert_array_equal( - nce1_loss.numpy(), nce2_loss.numpy() - ) - - nce2.weight = nce1.weight - nce2.bias = nce1.bias - np.testing.assert_array_equal( - nce1.weight.numpy(), nce2.weight.numpy() - ) - np.testing.assert_array_equal( - nce1.bias.numpy(), nce2.bias.numpy() - ) - - custom_weight = np.random.randn(dict_size, 128).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=fluid.dygraph.to_variable(np.array([5, 1])), - dtype='float32', - value=1, - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat(input=embs3, axis=1) - nce1 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce1.w', - bias_attr='nce1.b', - sample_weight=sample_weights, - ) - - nce2 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr=weight_attr, - bias_attr='nce2.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - self.assertFalse( - 
np.array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - ) - nce2.weight.set_value(nce1.weight.numpy()) - nce2.bias.set_value(nce1.bias) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - np.testing.assert_array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - - nce2.weight = nce1.weight - nce2.bias = nce1.bias - np.testing.assert_array_equal( - nce1.weight.numpy(), nce2.weight.numpy() - ) - np.testing.assert_array_equal(nce1.bias.numpy(), nce2.bias.numpy()) - def test_one_hot(self): with self.dynamic_graph(): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index 359cc50fb58..ee51b0d608a 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -330,58 +330,5 @@ class TestNCE_OpError(unittest.TestCase): ) -class TestDygraphNCE_OpError(unittest.TestCase): - def test_NCE_errors(self): - with program_guard(Program(), Program()): - nce = fluid.NCE(20, 5) - input1 = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - label1 = fluid.layers.data( - name='label1', shape=[-1, 4], dtype="int64" - ) - # the input(input) of NCE layer must be Variable. - self.assertRaises(TypeError, nce, input1, label1) - - input2 = fluid.layers.data( - name='input2', shape=[-1, 4], dtype="float32" - ) - label2 = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - # the input(label) of NCE layer must be Variable. - self.assertRaises(TypeError, nce, input2, label2) - - input3 = fluid.layers.data( - name='input3', shape=[-1, 4], dtype="float16" - ) - label3 = fluid.layers.data( - name='label3', shape=[-1, 1], dtype="int64" - ) - # the data type of input(input) must be float32 or float64. - self.assertRaises(TypeError, nce, input3, label3) - - input4 = fluid.layers.data( - name='input4', shape=[-1, 4], dtype="float32" - ) - label4 = fluid.layers.data( - name='label4', shape=[-1, 1], dtype="int32" - ) - # the data type of input(label) must be int64. - self.assertRaises(TypeError, nce, input4, label4) - - input5 = fluid.layers.data( - name='input5', shape=[-1, 4], dtype="float32" - ) - label5 = fluid.layers.data( - name='label5', shape=[-1, 1], dtype="int64" - ) - sample_weight = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - # the sample_weight of nce must be Variable or None. - self.assertRaises(TypeError, nce, input5, label5, sample_weight) - - if __name__ == '__main__': unittest.main() -- GitLab
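The test updates above (test_lac.py and test_imperative_ocr_attention_model.py) migrate from the removed fluid.dygraph.GRUUnit(size * 3) to paddle.nn.GRUCell(size * 3, size), keeping only the hidden-state return value. A minimal dygraph sketch of that call pattern, using the toy shapes from the removed GRUUnit docstring example; the two layers are drop-in compatible at these call sites but not numerically equivalent, since GRUUnit consumed a pre-projected 3*D-wide input while GRUCell applies its own input projection:

.. code-block:: python

    import numpy as np
    import paddle

    D = 5      # hidden size, as in the removed docstring example
    T = 9      # number of rows (sum of the example LoD [[2, 4, 3]])

    x = paddle.to_tensor(np.random.rand(T, 3 * D).astype('float32'))
    h_prev = paddle.to_tensor(np.random.rand(T, D).astype('float32'))

    # Old: updated_hidden, reset_hidden_pre, gate = GRUUnit(size=3 * D)(x, h_prev)
    # New: GRUCell returns (output, new_state); call sites keep only the hidden state.
    cell = paddle.nn.GRUCell(3 * D, D)
    h, _ = cell(x, h_prev)
    print(h.shape)      # [9, 5]

This patch introduces no dygraph counterpart for the removed fluid.NCE layer; the removed test_layers.py case had exercised it side by side with the static-graph paddle.static.nn.nce API.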