[remove fluid] under unittests of linear api (#48564)

* [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] fluid dygraph linear api * [remove fluid] fluid dygraph linear api * [remove fluid] fluid dygraph linear api

[remove fluid] under unittests of linear api (#48564)
* [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] under unittests of linear api * [remove fluid] fluid dygraph linear api * [remove fluid] fluid dygraph linear api * [remove fluid] fluid dygraph linear api
364b0b0a · wangzhen38 · GitHub · 33fa2684 · 364b0b0a · 364b0b0a
8 changed files
--- a/python/paddle/distributed/sharding/group_sharded.py
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -91,7 +91,7 @@ def group_sharded_parallel(

            # required: distributed
            import paddle
-            from paddle.fluid.dygraph.nn import Linear
+            from paddle.nn import Linear
            from paddle.distributed import fleet
            from paddle.distributed.sharding import group_sharded_parallel

@@ -238,7 +238,7 @@ def save_group_sharded_model(model, output, optimizer=None):

            # required: distributed
            import paddle
-            from paddle.fluid.dygraph.nn import Linear
+            from paddle.nn import Linear
            from paddle.distributed import fleet
            from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model


--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -23,7 +23,7 @@ from paddle.optimizer import Adam
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.nn import Sequential
-from paddle.fluid.dygraph import Linear
+from paddle.nn import Linear
 from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.framework import _test_eager_guard
@@ -111,7 +111,7 @@ class ModelForConv2dT(nn.Layer):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Conv2DTranspose(4, 6, (3, 3))
-        self.fc = Linear(input_dim=600, output_dim=num_classes)
+        self.fc = Linear(600, num_classes)

    def forward(self, inputs):
        x = self.features(inputs)
@@ -143,11 +143,9 @@ class ImperativeLenet(paddle.nn.Layer):
        )

        self.fc = Sequential(
-            Linear(input_dim=400, output_dim=120),
-            Linear(input_dim=120, output_dim=84),
-            Linear(
-                input_dim=84, output_dim=num_classes, act=classifier_activation
-            ),
+            Linear(400, 120),
+            Linear(120, 84),
+            Linear(84, num_classes),
        )

    def forward(self, inputs):

--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -821,11 +821,12 @@ class ReduceLROnPlateau(LearningRateDecay):
    .. code-block:: python

        import paddle.fluid as fluid
+        import paddle
        import numpy as np

        with fluid.dygraph.guard():
            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-            linear = fluid.dygraph.Linear(10, 10)
+            linear = paddle.nn.Linear(10, 10)
            input = fluid.dygraph.to_variable(x)

            reduce_lr = fluid.dygraph.ReduceLROnPlateau(
@@ -842,7 +843,7 @@ class ReduceLROnPlateau(LearningRateDecay):
                total_loss = 0
                for bath_id in range(5):
                    out = linear(input)
-                    loss = fluid.layers.reduce_mean(out)
+                    loss = paddle.mean(out)
                    total_loss += loss
                    adam.minimize(loss)

@@ -1090,9 +1091,10 @@ class StepDecay(_LearningRateEpochDecay):

            import paddle.fluid as fluid
            import numpy as np
+            import paddle
            with fluid.dygraph.guard():
                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = fluid.dygraph.Linear(10, 10)
+                linear = paddle.nn.Linear(10, 10)
                input = fluid.dygraph.to_variable(x)
                scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
@@ -1100,7 +1102,7 @@ class StepDecay(_LearningRateEpochDecay):
                for epoch in range(9):
                    for batch_id in range(5):
                        out = linear(input)
-                        loss = fluid.layers.reduce_mean(out)
+                        loss = paddle.mean(out)
                        adam.minimize(loss)
                    scheduler.epoch()

@@ -1170,9 +1172,10 @@ class MultiStepDecay(_LearningRateEpochDecay):

            import paddle.fluid as fluid
            import numpy as np
+            import paddle
            with fluid.dygraph.guard():
                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = fluid.dygraph.Linear(10, 10)
+                linear = paddle.nn.Linear(10, 10)
                input = fluid.dygraph.to_variable(x)
                scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5])
                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
@@ -1180,7 +1183,7 @@ class MultiStepDecay(_LearningRateEpochDecay):
                for epoch in range(6):
                    for batch_id in range(5):
                        out = linear(input)
-                        loss = fluid.layers.reduce_mean(out)
+                        loss = paddle.mean(out)
                        adam.minimize(loss)
                    scheduler.epoch()

@@ -1255,9 +1258,10 @@ class LambdaDecay(_LearningRateEpochDecay):

            import paddle.fluid as fluid
            import numpy as np
+            import paddle
            with fluid.dygraph.guard():
                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = fluid.dygraph.Linear(10, 10)
+                linear = paddle.nn.Linear(10, 10)
                input = fluid.dygraph.to_variable(x)
                scheduler = fluid.dygraph.LambdaDecay(0.5, lr_lambda=lambda x: 0.95**x)
                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
@@ -1265,7 +1269,7 @@ class LambdaDecay(_LearningRateEpochDecay):
                for epoch in range(6):
                    for batch_id in range(5):
                        out = linear(input)
-                        loss = fluid.layers.reduce_mean(out)
+                        loss = paddle.mean(out)
                        adam.minimize(loss)
                    scheduler.epoch()


--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -50,592 +50,11 @@ import paddle.utils.deprecated as deprecated
 from paddle import _C_ops, _legacy_C_ops

 __all__ = [
-    'Conv3D',
-    'Linear',
    'BatchNorm',
    'Embedding',
-    'Conv3DTranspose',
 ]


-class Conv3D(layers.Layer):
-    r"""
-    **Convlution3D Layer**
-
-    The convolution3D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are multidimensional tensors with a shape of
-    :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
-    channels, D is the depth of the feature, H is the height of the feature,
-    and W is the width of the feature. Convlution3D is similar with Convlution2D
-    but adds one dimension(depth). If bias attribution and activation type are
-    provided, bias is added to the output of the convolution, and the
-    corresponding activation function is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
-
-        - Output:
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
-
-    Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of filter. It is as same as the output image channel.
-        filter_size (int|tuple, optional): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square, filter_size_depth = filter_size_height
-            = filter_size_width = filter_size.
-        stride (int|tuple, optional): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
-            stride_D = stride_H = stride_W = stride. The default value is 1.
-        padding (int|tuple, optional): The padding size. If padding is a tuple, it must
-            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
-            padding_D = padding_H = padding_W = padding. The default value is 0.
-        dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
-            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
-        groups (int, optional): The groups number of the Conv3D Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
-            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as param_attr. If it is set to None, the parameter
-            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
-            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
-        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. The default value is None.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
-    Attribute:
-        **weight** (Parameter): the learnable weights of filters of this layer.
-
-        **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
-              conv3d = fluid.dygraph.nn.Conv3D(
-                    num_channels=3, num_filters=2, filter_size=3, act="relu")
-              ret = conv3d(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(
-        self,
-        num_channels,
-        num_filters,
-        filter_size,
-        stride=1,
-        padding=0,
-        dilation=1,
-        groups=None,
-        param_attr=None,
-        bias_attr=None,
-        use_cudnn=True,
-        act=None,
-        dtype='float32',
-    ):
-        assert param_attr is not False, "param_attr should not be False here."
-        super().__init__()
-        self._num_channels = num_channels
-        self._groups = groups
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._padding = utils.convert_to_list(padding, 3, 'padding')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._act = act
-        self._use_cudnn = use_cudnn
-        self._filter_size = filter_size
-        self._num_filters = num_filters
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._dtype = dtype
-
-        if self._groups is None:
-            num_filter_channels = self._num_channels
-        else:
-            if self._num_channels % self._groups != 0:
-                raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = self._num_channels // self._groups
-
-        filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size')
-        filter_shape = [self._num_filters, num_filter_channels] + filter_size
-
-        def _get_default_param_initializer():
-            filter_elem_num = (
-                filter_size[0]
-                * filter_size[1]
-                * filter_size[2]
-                * self._num_channels
-            )
-            std = (2.0 / filter_elem_num) ** 0.5
-            return Normal(0.0, std, 0)
-
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(),
-        )
-
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True,
-        )
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype
-        )
-
-        self._helper.append_op(
-            type='conv3d',
-            inputs={
-                'Input': input,
-                'Filter': self.weight,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn,
-                'use_mkldnn': False,
-            },
-        )
-
-        if self.bias is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype
-            )
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias], 'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1},
-            )
-        else:
-            pre_act = pre_bias
-
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv3DTranspose(layers.Layer):
-    r"""
-    **Convlution3D transpose layer**
-
-    The convolution3D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCDHW format. Where N is batch size, C is the number of channels,
-    D is the depth of the feature, H is the height of the feature, and W
-    is the width of the feature. Parameters(dilations, strides, paddings) are
-    two elements. These two elements represent height and width, respectively.
-    The details of convolution transpose layer, please refer to the following
-    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
-           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
-           D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
-
-    **Note**:
-
-          The conv3d_transpose can be seen as the backward of the conv3d. For conv3d,
-          when stride > 1, conv3d maps multiple input shape to the same output shape,
-          so for conv3d_transpose, when stride > 1, input shape maps multiple output shape.
-          If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \
-          H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output
-          size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`,
-          the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}`
-          and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must
-          between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`,
-          conv3d_transpose can compute the kernel size automatically.
-
-
-    Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        filter_size(int|tuple): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        padding(int|tuple, optional): The padding size. The padding argument effectively
-             adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
-             either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
-             is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
-            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `'NCDHW'`, `padding` can be in the form
-            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `'NDHWC'`, `padding` can be in the form
-            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-            The default value is 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
-            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
-            The default value is 1.
-        dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
-            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
-        groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
-            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. The default value is None.
-        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. The default value is None.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
-        name(str, optional): The default value is None. Normally there is no need for user
-            to set this property. For more information, please refer to :ref:`api_guide_Name`.
-
-    Attribute:
-        **weight** (Parameter): the learnable weights of filters of this layer.
-
-        **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-         import paddle.fluid as fluid
-         import numpy
-
-         with fluid.dygraph.guard():
-             data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
-             conv3dTranspose = fluid.dygraph.nn.Conv3DTranspose(
-                    num_channels=3,
-                    num_filters=12,
-                    filter_size=12,
-                    use_cudnn=False)
-             ret = conv3dTranspose(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(
-        self,
-        num_channels,
-        num_filters,
-        filter_size,
-        padding=0,
-        stride=1,
-        dilation=1,
-        groups=None,
-        param_attr=None,
-        bias_attr=None,
-        use_cudnn=True,
-        act=None,
-        dtype='float32',
-    ):
-        super().__init__()
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        assert (
-            param_attr is not False
-        ), "param_attr should not be False in conv3d_transpose."
-        self._padding = utils.convert_to_list(padding, 3, 'padding')
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._param_attr = param_attr
-        self._num_channels = num_channels
-        self._filter_size = filter_size
-        self._groups = 1 if groups is None else groups
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._bias_attr = bias_attr
-        self._act = act
-        self._dtype = dtype
-
-        self._filter_size = utils.convert_to_list(
-            self._filter_size, 3, 'conv3d_transpose.filter_size'
-        )
-
-        filter_shape = [
-            self._num_channels,
-            self._num_filters // self._groups,
-        ] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr
-        )
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True,
-        )
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype
-        )
-        self._helper.append_op(
-            type="conv3d_transpose",
-            inputs={'Input': [input], 'Filter': [self.weight]},
-            outputs={'Output': pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn,
-            },
-        )
-
-        if self._bias_attr:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype
-            )
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias], 'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1},
-            )
-        else:
-            pre_act = pre_bias
-
-        # Currently, we don't support inplace in imperative mode
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Linear(layers.Layer):
-    """
-
-    Fully-connected linear transformation layer:
-
-    .. math::
-
-        Out = Act({XW + b})
-
-    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
-
-    Linear layer takes only one ``Tensor`` input.
-    The Linear layer multiplies input tensor with weight matrix and
-    produces an output Tensor of shape [N, *, `output_dim`],
-    where N is batch size and `*` means any number of additional dimensions.
-    If ``bias_attr`` is not None, a bias variable will be created and added to the output.
-    Finally, if ``act`` is not None, it will be applied to the output as well.
-
-    Parameters:
-        input_dim(int): The number of input units in this layer.
-        output_dim(int): The number of output units in this layer.
-        param_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
-            weights(Parameter) of this layer. Default: None.
-        bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act(str, optional): Activation to be applied to the output of this layer. Default: None.
-        dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".
-
-    Attributes:
-        **weight** (Parameter): the learnable weights of this layer.
-
-        **bias** (Parameter or None): the learnable bias of this layer.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import Linear
-          import numpy as np
-
-          data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
-          with fluid.dygraph.guard():
-              linear = Linear(32, 64)
-              data = to_variable(data)
-              res = linear(data)  # [30, 10, 64]
-    """
-
-    def __init__(
-        self,
-        input_dim,
-        output_dim,
-        param_attr=None,
-        bias_attr=None,
-        act=None,
-        dtype="float32",
-    ):
-        super().__init__()
-        self._act = act
-        self._dtype = dtype
-        self.weight = self.create_parameter(
-            shape=[input_dim, output_dim],
-            attr=param_attr,
-            dtype=dtype,
-            is_bias=False,
-        )
-        self.bias = self.create_parameter(
-            shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True
-        )
-
-        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
-
-    def forward(self, input):
-        if _non_static_mode():
-            pre_bias = _varbase_creator(dtype=input.dtype)
-            _legacy_C_ops.matmul(
-                input,
-                self.weight,
-                pre_bias,
-                'transpose_X',
-                False,
-                'transpose_Y',
-                False,
-                "alpha",
-                1,
-                "use_mkldnn",
-                self._use_mkldnn,
-            )
-            pre_act = dygraph_utils._append_bias_in_dygraph(
-                pre_bias,
-                self.bias,
-                axis=len(input.shape) - 1,
-                use_mkldnn=self._use_mkldnn,
-            )
-
-            return dygraph_utils._append_activation_in_dygraph(
-                pre_act, self._act, use_mkldnn=self._use_mkldnn
-            )
-
-        check_variable_and_dtype(
-            input, 'input', ['float16', 'float32', 'float64'], "Linear"
-        )
-
-        attrs = {
-            "transpose_X": False,
-            "transpose_Y": False,
-            "alpha": 1,
-            "use_mkldnn": self._use_mkldnn,
-        }
-        inputs = {"X": [input], "Y": [self.weight]}
-
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="matmul", inputs=inputs, outputs={"Out": tmp}, attrs=attrs
-        )
-        if self.bias is not None:
-            pre_activation = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype
-            )
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [tmp], 'Y': [self.bias]},
-                outputs={'Out': [pre_activation]},
-                attrs={
-                    'axis': len(input.shape) - 1,
-                    'use_mkldnn': self._use_mkldnn,
-                },
-            )
-        else:
-            pre_activation = tmp
-        return self._helper.append_activation(pre_activation, act=self._act)
-
-
 class BatchNorm(layers.Layer):
    r"""


--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -165,12 +165,12 @@ def monkey_patch_varbase():

                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import Linear
+                from paddle.nn import Linear
                import numpy as np

                data = np.ones([3, 1024], dtype='float32')
                with fluid.dygraph.guard():
-                    linear = fluid.dygraph.Linear(1024, 4)
+                    linear = Linear(1024, 4)
                    t = to_variable(data)
                    linear(t)  # call with default weight
                    custom_weight = np.random.randn(1024, 4).astype("float32")

--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -39,8 +39,10 @@ __all__ = ['run_check']
 class SimpleLayer(Layer):
    def __init__(self, input_size):
        super().__init__()
-        self._linear1 = nn.Linear(
-            input_size, 3, param_attr=ParamAttr(initializer=Constant(value=0.1))
+        self._linear1 = paddle.nn.Linear(
+            input_size,
+            3,
+            weight_attr=ParamAttr(initializer=Constant(value=0.1)),
        )

    def forward(self, inputs):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -475,9 +475,10 @@ class Optimizer:
            .. code-block:: python

                import paddle.fluid as fluid
+                import paddle

                with fluid.dygraph.guard():
-                    linear = fluid.dygraph.nn.Linear(10, 10)
+                    linear = paddle.nn.Linear(10, 10)

                    adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())

@@ -576,6 +577,7 @@ class Optimizer:

                import paddle.fluid as fluid
                import numpy as np
+                import paddle

                # example1: LearningRateDecay is not used, return value is all the same
                with fluid.dygraph.guard():
@@ -587,10 +589,10 @@ class Optimizer:
                # example2: PiecewiseDecay is used, return the step learning rate
                with fluid.dygraph.guard():
                    inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
-                    linear = fluid.dygraph.nn.Linear(10, 10)
+                    linear = paddle.nn.Linear(10, 10)
                    inp = fluid.dygraph.to_variable(inp)
                    out = linear(inp)
-                    loss = fluid.layers.reduce_mean(out)
+                    loss = paddle.mean(out)

                    bd = [2, 4, 6, 8]
                    value = [0.2, 0.4, 0.6, 0.8, 1.0]
@@ -1340,12 +1342,13 @@ class Optimizer:
            .. code-block:: python

                import paddle.fluid as fluid
+                import paddle
                import numpy as np

                with fluid.dygraph.guard():
                    value = np.arange(26).reshape(2, 13).astype("float32")
                    a = fluid.dygraph.to_variable(value)
-                    linear = fluid.Linear(13, 5, dtype="float32")
+                    linear = paddle.nn.Linear(13, 5)
                    # This can be any optimizer supported by dygraph.
                    adam = fluid.optimizer.Adam(learning_rate = 0.01,
                                                parameter_list = linear.parameters())

--- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py
@@ -18,7 +18,7 @@ from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
 import paddle
 import paddle.fluid as fluid
 import paddle.nn.functional as F
-from paddle.fluid.dygraph import Embedding, Layer, Linear, to_variable
+from paddle.fluid.dygraph import Embedding, Layer, to_variable
 from paddle.optimizer.lr import NoamDecay

 """
@@ -269,8 +269,8 @@ class PrePostProcessLayer(Layer):
 class PositionwiseFeedForwardLayer(Layer):
    def __init__(self, d_inner_hid, d_hid, dropout_rate):
        super().__init__()
-        self._i2h = Linear(d_hid, d_inner_hid, act="relu")
-        self._h2o = Linear(d_inner_hid, d_hid)
+        self._i2h = paddle.nn.Linear(d_hid, d_inner_hid)
+        self._h2o = paddle.nn.Linear(d_inner_hid, d_hid)
        self._dropout_rate = dropout_rate

    def forward(self, x):
@@ -304,10 +304,18 @@ class MultiHeadAttentionLayer(Layer):
        self._d_value = d_value
        self._d_model = d_model
        self._dropout_rate = dropout_rate
-        self._q_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
-        self._k_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
-        self._v_fc = Linear(self._d_model, d_value * n_head, bias_attr=False)
-        self._proj_fc = Linear(d_value * n_head, self._d_model, bias_attr=False)
+        self._q_fc = paddle.nn.Linear(
+            self._d_model, d_key * n_head, bias_attr=False
+        )
+        self._k_fc = paddle.nn.Linear(
+            self._d_model, d_key * n_head, bias_attr=False
+        )
+        self._v_fc = paddle.nn.Linear(
+            self._d_model, d_value * n_head, bias_attr=False
+        )
+        self._proj_fc = paddle.nn.Linear(
+            d_value * n_head, self._d_model, bias_attr=False
+        )

    def forward(self, queries, keys, values, attn_bias):
        # compute q ,k ,v
@@ -825,7 +833,9 @@ class WrapDecoderLayer(Layer):
        )
        self._weight_sharing = weight_sharing
        if not weight_sharing:
-            self._fc = Linear(d_model, trg_vocab_size, bias_attr=False)
+            self._fc = paddle.nn.Linear(
+                d_model, trg_vocab_size, bias_attr=False
+            )

    def forward(self, dec_inputs=None, enc_output=None):
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs