Unverified commit 1c95631f, authored by zhiboniu, committed by GitHub

unset fluid api in nn.layer (#34129)

Parent: 1cb4c154
@@ -15,6 +15,11 @@
 # TODO: import all neural network related api under this directory,
 # including layers, linear, conv, rnn etc.
+from ..fluid.dygraph.layers import Layer  # noqa: F401
+from ..fluid.dygraph.container import LayerList  # noqa: F401
+from ..fluid.dygraph.container import ParameterList  # noqa: F401
+from ..fluid.dygraph.container import Sequential  # noqa: F401
+
 from .clip import ClipGradByGlobalNorm  # noqa: F401
 from .clip import ClipGradByNorm  # noqa: F401
 from .clip import ClipGradByValue  # noqa: F401
@@ -130,10 +135,6 @@ from .utils.spectral_norm_hook import spectral_norm
 # TODO: remove loss; keep it for now since it is still used in many unit tests
 from .layer import loss  # noqa: F401
-from ..fluid.dygraph.layers import Layer  # noqa: F401
-from ..fluid.dygraph.container import LayerList  # noqa: F401
-from ..fluid.dygraph.container import ParameterList  # noqa: F401
-from ..fluid.dygraph.container import Sequential  # noqa: F401
 from . import utils  # noqa: F401
 from . import functional  # noqa: F401
...
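The hunks above move the `Layer`, `LayerList`, `ParameterList`, and `Sequential` re-exports to the top of `paddle.nn`'s `__init__`, so user code no longer needs to reach into `paddle.fluid.dygraph`. A minimal sketch of the public import path this makes canonical:

```python
import paddle
from paddle.nn import Layer, Sequential  # now importable from paddle.nn directly

model = Sequential(
    paddle.nn.Linear(4, 8),
    paddle.nn.ReLU(),
    paddle.nn.Linear(8, 1),
)
assert isinstance(model, Layer)  # Sequential is itself a Layer
x = paddle.randn([2, 4])
print(model(x).shape)            # [2, 1]
```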
@@ -14,18 +14,18 @@
 # TODO: define activation functions of neural network
-from ...fluid.dygraph import layers
 from ...fluid import core
 from ...fluid.framework import in_dygraph_mode
-from ...fluid.param_attr import ParamAttr
-from ...fluid.initializer import Constant
+from ...framework import ParamAttr
+from ..initializer import Constant
 from paddle.framework import get_default_dtype
 from .. import functional as F
+from paddle.nn import Layer

 __all__ = []

-class ELU(layers.Layer):
+class ELU(Layer):
     r"""
     ELU Activation.
@@ -67,7 +67,7 @@ class ELU(layers.Layer):
         return 'alpha={}{}'.format(self._alpha, name_str)

-class GELU(layers.Layer):
+class GELU(Layer):
     r"""
     GELU Activation.
@@ -120,7 +120,7 @@ class GELU(layers.Layer):
         return 'approximate={}{}'.format(self._approximate, name_str)

-class Hardshrink(layers.Layer):
+class Hardshrink(Layer):
     r"""
     Hardshrink Activation
@@ -168,7 +168,7 @@ class Hardshrink(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)

-class Hardswish(layers.Layer):
+class Hardswish(Layer):
     r"""
     Hardswish activation
@@ -218,7 +218,7 @@ class Hardswish(layers.Layer):
         return name_str

-class Tanh(layers.Layer):
+class Tanh(Layer):
     r"""
     Tanh Activation.
@@ -259,7 +259,7 @@ class Tanh(layers.Layer):
         return name_str

-class Hardtanh(layers.Layer):
+class Hardtanh(Layer):
     r"""
     Hardtanh Activation
@@ -305,7 +305,7 @@ class Hardtanh(layers.Layer):
         return 'min={}, max={}{}'.format(self._min, self._max, name_str)

-class PReLU(layers.Layer):
+class PReLU(Layer):
     """
     PReLU Activation.
@@ -377,7 +377,7 @@ class PReLU(layers.Layer):
             self._num_parameters, self._init, self._dtype, name_str)

-class ReLU(layers.Layer):
+class ReLU(Layer):
     """
     ReLU Activation.
@@ -415,7 +415,7 @@ class ReLU(layers.Layer):
         return name_str

-class ReLU6(layers.Layer):
+class ReLU6(Layer):
     """
     ReLU6 Activation
@@ -454,7 +454,7 @@ class ReLU6(layers.Layer):
         return name_str

-class SELU(layers.Layer):
+class SELU(Layer):
     r"""
     SELU Activation
@@ -505,7 +505,7 @@ class SELU(layers.Layer):
                                                    name_str)

-class LeakyReLU(layers.Layer):
+class LeakyReLU(Layer):
     r"""
     Leaky ReLU Activation.
@@ -553,7 +553,7 @@ class LeakyReLU(layers.Layer):
         return 'negative_slope={}{}'.format(self._negative_slope, name_str)

-class Sigmoid(layers.Layer):
+class Sigmoid(Layer):
     """
     This interface is used to construct a callable object of the ``Sigmoid`` class. This layer calculates the `sigmoid` of input x.
@@ -593,7 +593,7 @@ class Sigmoid(layers.Layer):
         return name_str

-class Hardsigmoid(layers.Layer):
+class Hardsigmoid(Layer):
     r"""
     This interface is used to construct a callable object of the ``Hardsigmoid`` class.
     This layer calculates the `hardsigmoid` of input x.
@@ -644,7 +644,7 @@ class Hardsigmoid(layers.Layer):
         return name_str

-class Softplus(layers.Layer):
+class Softplus(Layer):
     r"""
     Softplus Activation
@@ -689,7 +689,7 @@ class Softplus(layers.Layer):
                                                       name_str)

-class Softshrink(layers.Layer):
+class Softshrink(Layer):
     r"""
     Softshrink Activation
@@ -734,7 +734,7 @@ class Softshrink(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)

-class Softsign(layers.Layer):
+class Softsign(Layer):
     r"""
     Softsign Activation
@@ -773,7 +773,7 @@ class Softsign(layers.Layer):
         return name_str

-class Swish(layers.Layer):
+class Swish(Layer):
     r"""
     Swish Activation.
@@ -812,7 +812,7 @@ class Swish(layers.Layer):
         return name_str

-class Tanhshrink(layers.Layer):
+class Tanhshrink(Layer):
     """
     Tanhshrink Activation
@@ -851,7 +851,7 @@ class Tanhshrink(layers.Layer):
         return name_str

-class ThresholdedReLU(layers.Layer):
+class ThresholdedReLU(Layer):
     r"""
     Thresholded ReLU Activation
@@ -895,7 +895,7 @@ class ThresholdedReLU(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)

-class Silu(layers.Layer):
+class Silu(Layer):
     """
     Silu Activation.

     .. math::
@@ -933,7 +933,7 @@ class Silu(layers.Layer):
         return name_str

-class LogSigmoid(layers.Layer):
+class LogSigmoid(Layer):
     r"""
     LogSigmoid Activation.
@@ -972,7 +972,7 @@ class LogSigmoid(layers.Layer):
         return name_str

-class Softmax(layers.Layer):
+class Softmax(Layer):
     r"""
     Softmax Activation.
@@ -1099,7 +1099,7 @@ class Softmax(layers.Layer):
         return 'axis={}{}'.format(self._axis, name_str)

-class LogSoftmax(layers.Layer):
+class LogSoftmax(Layer):
     r"""
     This operator implements the log_softmax layer. The calculation process is as follows:
@@ -1157,7 +1157,7 @@ class LogSoftmax(layers.Layer):
         return 'axis={}{}'.format(self._axis, name_str)

-class Maxout(layers.Layer):
+class Maxout(Layer):
     r"""
     Maxout Activation.
...
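Every activation class in this file now subclasses the re-exported `paddle.nn.Layer` directly. A hedged sketch of the subclassing pattern these classes all follow (`ScaledReLU` is a hypothetical example, not part of the diff):

```python
import paddle
import paddle.nn.functional as F
from paddle.nn import Layer  # the import this diff standardizes on

class ScaledReLU(Layer):
    """Hypothetical activation following the same pattern as ELU, GELU, etc."""

    def __init__(self, scale=2.0, name=None):
        super(ScaledReLU, self).__init__()
        self._scale = scale
        self._name = name

    def forward(self, x):
        return self._scale * F.relu(x)

    def extra_repr(self):
        # Same extra_repr convention the diff's activation classes use.
        name_str = ', name={}'.format(self._name) if self._name else ''
        return 'scale={}{}'.format(self._scale, name_str)

act = ScaledReLU()
print(act(paddle.to_tensor([-1.0, 3.0])))  # [0., 6.]
```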
@@ -15,10 +15,10 @@
 # TODO: define the common classes to build a neural network
 import paddle
 from ...fluid.dygraph import Flatten  # noqa: F401
-from ...fluid.dygraph import layers
 from ...fluid.framework import in_dygraph_mode
 from .. import functional as F
 from ...fluid.framework import _dygraph_tracer
+from paddle.nn import Layer

 __all__ = []
@@ -30,7 +30,7 @@ def _npairs(x, n):
     return x

-class Linear(layers.Layer):
+class Linear(Layer):
     r"""
     Fully-connected linear transformation layer. For each input :math:`X`,
@@ -135,7 +135,7 @@ class Linear(layers.Layer):
             self.weight.shape[0], self.weight.shape[1], self._dtype, name_str)

-class Upsample(layers.Layer):
+class Upsample(Layer):
     """
     This op resizes a batch of images.
@@ -385,7 +385,7 @@ class Upsample(layers.Layer):
                    self.data_format, name_str)

-class UpsamplingNearest2D(layers.Layer):
+class UpsamplingNearest2D(Layer):
     """
     This op upsamples a batch of images, using nearest neighbours' pixel values.
     The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w),
@@ -470,7 +470,7 @@ class UpsamplingNearest2D(layers.Layer):
                    name_str)

-class UpsamplingBilinear2D(layers.Layer):
+class UpsamplingBilinear2D(Layer):
     """
     This op upsamples a batch of images, using bilinear interpolation of pixel values.
     The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w),
@@ -556,7 +556,7 @@ class UpsamplingBilinear2D(layers.Layer):
                    name_str)

-class Bilinear(layers.Layer):
+class Bilinear(Layer):
     r"""
     This layer performs a bilinear transformation on two inputs.
@@ -651,7 +651,7 @@ class Bilinear(layers.Layer):
             self._dtype, name_str)

-class Dropout(layers.Layer):
+class Dropout(Layer):
     """
     Dropout is a regularization technique for reducing overfitting by preventing
     neuron co-adaptation during training, as described in the paper:
@@ -725,7 +725,7 @@ class Dropout(layers.Layer):
                    name_str)

-class Dropout2D(layers.Layer):
+class Dropout2D(Layer):
     """
     Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW`,
     a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently
@@ -786,7 +786,7 @@ class Dropout2D(layers.Layer):
                    name_str)

-class Dropout3D(layers.Layer):
+class Dropout3D(Layer):
     """
     Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW`,
     a channel is a 3D feature map with the shape `DHW`). Each channel will be zeroed out independently
@@ -847,7 +847,7 @@ class Dropout3D(layers.Layer):
                    name_str)

-class AlphaDropout(layers.Layer):
+class AlphaDropout(Layer):
     """
     Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with
     zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and
@@ -900,7 +900,7 @@ class AlphaDropout(layers.Layer):
         return 'p={}{}'.format(self.p, name_str)

-class Pad1D(layers.Layer):
+class Pad1D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad1D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -981,7 +981,7 @@ class Pad1D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)

-class Pad2D(layers.Layer):
+class Pad2D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad2D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -1065,7 +1065,7 @@ class Pad2D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)

-class Pad3D(layers.Layer):
+class Pad3D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad3D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -1149,7 +1149,7 @@ class Pad3D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)

-class CosineSimilarity(layers.Layer):
+class CosineSimilarity(Layer):
     """
     This interface is used to compute cosine similarity between x1 and x2 along axis.
@@ -1206,7 +1206,7 @@ class CosineSimilarity(layers.Layer):
         return 'axis={_axis}, eps={_eps}'.format(**self.__dict__)

-class Embedding(layers.Layer):
+class Embedding(Layer):
     r"""
     **Embedding Layer**
@@ -1367,7 +1367,7 @@ class Embedding(layers.Layer):
         return main_str.format(**self.__dict__)

-class Unfold(layers.Layer):
+class Unfold(Layer):
     """
     This op returns a col buffer of sliding local blocks of input x, also known
     as im2col for batched 2D image tensors. For each block under the convolution filter,
...
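After this change, the common layers are plain `paddle.nn.Layer` subclasses, so the usual `Layer` machinery (sublayer tracking, train/eval switching) applies uniformly. A quick sketch:

```python
import paddle

linear = paddle.nn.Linear(4, 2)
drop = paddle.nn.Dropout(p=0.5)

# Both are paddle.nn.Layer instances after this refactor.
print(isinstance(linear, paddle.nn.Layer), isinstance(drop, paddle.nn.Layer))

drop.eval()                          # Dropout is a pass-through in eval mode
y = drop(linear(paddle.randn([3, 4])))
print(y.shape)                       # [3, 2]
```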
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import OrderedDict
-from ...fluid.dygraph.layers import Layer
+from .. import Layer
 from collections.abc import Iterable, Mapping

 __all__ = []
...
@@ -19,8 +19,8 @@ import numpy as np
 from ...fluid import get_flags
 from ...fluid import core
 from ...device import get_cudnn_version
-from ...fluid.dygraph import layers
-from ...fluid.initializer import Normal
+from .. import Layer
+from ..initializer import Normal
 from .. import functional as F
 from ...fluid.layers import utils
 from ..functional.conv import _update_padding_nd
@@ -31,7 +31,7 @@ __all__ = []
 def _get_default_param_initializer(num_channels, filter_size):
     filter_elem_num = num_channels * np.prod(filter_size)
     std = (2.0 / filter_elem_num)**0.5
-    return Normal(0.0, std, 0)
+    return Normal(0.0, std)

 def _reverse_repeat_list(t, n):
@@ -42,7 +42,7 @@ def _reverse_repeat_list(t, n):
     return list(x for x in reversed(t) for _ in range(n))

-class _ConvNd(layers.Layer):
+class _ConvNd(Layer):
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -127,7 +127,7 @@ class _ConvNd(layers.Layer):
                 return None
             filter_elem_num = np.prod(self._kernel_size) * self._in_channels
             std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
+            return Normal(0.0, std)

         self.weight = self.create_parameter(
             shape=filter_shape,
...
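The conv hunks drop the trailing third argument (a fixed seed of 0) that the old fluid `Normal` initializer accepted; the `paddle.nn.initializer.Normal` replacement takes only a mean and a std. The std itself is the usual He/MSRA value `sqrt(2 / fan_in)`. A hedged sketch of the same computation through the public API (`default_conv_weight_attr` is a hypothetical helper for illustration):

```python
import numpy as np
import paddle

def default_conv_weight_attr(in_channels, kernel_size):
    # He/MSRA-style std, matching the diff's (2.0 / filter_elem_num) ** 0.5.
    fan_in = in_channels * int(np.prod(kernel_size))
    std = (2.0 / fan_in) ** 0.5
    # New-style Normal: only mean/std, no trailing seed argument.
    return paddle.ParamAttr(
        initializer=paddle.nn.initializer.Normal(mean=0.0, std=std))

conv = paddle.nn.Conv2D(3, 16, kernel_size=3,
                        weight_attr=default_conv_weight_attr(3, (3, 3)))
```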
@@ -15,7 +15,7 @@
 import numpy as np
 import paddle
-from ...fluid.dygraph import layers
+from .. import Layer
 from ...fluid.framework import core, in_dygraph_mode
 from ...fluid.data_feeder import check_variable_and_dtype, check_type
 from ...fluid.layer_helper import LayerHelper
@@ -24,7 +24,7 @@ from paddle import _C_ops
 __all__ = []

-class PairwiseDistance(layers.Layer):
+class PairwiseDistance(Layer):
     r"""
     This operator computes the pairwise distance between two vectors. The
     distance is calculated by the p-order norm:
@@ -87,7 +87,7 @@ class PairwiseDistance(layers.Layer):
                                  'PairwiseDistance')
         check_variable_and_dtype(y, 'y', ['float32', 'float64'],
                                  'PairwiseDistance')
-        sub = paddle.fluid.layers.elementwise_sub(x, y)
+        sub = paddle.subtract(x, y)

         helper = LayerHelper("PairwiseDistance", name=self.name)
         attrs = {
...
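Here `paddle.fluid.layers.elementwise_sub` is replaced with the public `paddle.subtract`, which computes the same broadcasted element-wise difference. A quick sketch of the equivalence, assuming dygraph mode:

```python
import paddle

x = paddle.to_tensor([[1.0, 3.0], [3.0, 5.0]])
y = paddle.to_tensor([[5.0, 6.0], [7.0, 8.0]])

sub = paddle.subtract(x, y)   # public replacement for elementwise_sub(x, y)
print(sub.numpy())            # [[-4. -3.] [-4. -3.]]

# The p-norm of that difference is exactly what PairwiseDistance returns.
dist = paddle.nn.PairwiseDistance(p=2.)(x, y)
print(dist.numpy())           # [5. 5.]
```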
@@ -20,11 +20,12 @@ import paddle.fluid.core as core
 import paddle
 from .. import functional as F
 from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator
+from .. import Layer

 __all__ = []

-class BCEWithLogitsLoss(fluid.dygraph.Layer):
+class BCEWithLogitsLoss(Layer):
     r"""
     This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
     Also, we can see it as the combination of ``sigmoid_cross_entropy_with_logits``
@@ -128,7 +129,7 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer):
         return out

-class CrossEntropyLoss(fluid.dygraph.Layer):
+class CrossEntropyLoss(Layer):
     r"""
     By default, this operator implements the cross entropy loss function with softmax. This function
     combines the calculation of the softmax operation and the cross entropy loss function
@@ -407,7 +408,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
         return ret

-class HSigmoidLoss(fluid.dygraph.Layer):
+class HSigmoidLoss(Layer):
     """
     Hierarchical Sigmoid Layer.
@@ -529,7 +530,7 @@ class HSigmoidLoss(fluid.dygraph.Layer):
         return out

-class MSELoss(fluid.dygraph.layers.Layer):
+class MSELoss(Layer):
     r"""
     **Mean Square Error Loss**
     Computes the mean square error (squared L2 norm) of given input and label.
@@ -596,8 +597,7 @@ class MSELoss(fluid.dygraph.layers.Layer):
             fluid.data_feeder.check_variable_and_dtype(
                 label, 'label', ['float32', 'float64'], 'MSELoss')

-        square_out = fluid.layers.square(
-            fluid.layers.elementwise_sub(input, label))
+        square_out = paddle.square(paddle.subtract(input, label))
         if self.reduction == 'none':
             return square_out
@@ -608,7 +608,7 @@ class MSELoss(fluid.dygraph.layers.Layer):
         return getattr(fluid.layers, reduce_op)(square_out)

-class L1Loss(fluid.dygraph.Layer):
+class L1Loss(Layer):
     r"""
     This interface is used to construct a callable object of the ``L1Loss`` class.
     The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
@@ -687,7 +687,7 @@ class L1Loss(fluid.dygraph.Layer):
             input, label, self.reduction, name=self.name)

-class BCELoss(fluid.dygraph.Layer):
+class BCELoss(Layer):
     """
     This interface is used to construct a callable object of the ``BCELoss`` class.
     The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
@@ -777,7 +777,7 @@ class BCELoss(fluid.dygraph.Layer):
         return out

-class NLLLoss(fluid.dygraph.Layer):
+class NLLLoss(Layer):
     r"""
     :alias_main: paddle.nn.NLLLoss
     :alias: paddle.nn.NLLLoss, paddle.nn.layer.NLLLoss, paddle.nn.layer.loss.NLLLoss
@@ -886,7 +886,7 @@ class NLLLoss(fluid.dygraph.Layer):
             name=self._name)

-class KLDivLoss(fluid.dygraph.Layer):
+class KLDivLoss(Layer):
     r"""
     This interface calculates the Kullback-Leibler divergence loss
     between Input(X) and Input(Target). Note that Input(X) is the
@@ -959,7 +959,7 @@ class KLDivLoss(fluid.dygraph.Layer):
         return out

-class MarginRankingLoss(fluid.dygraph.Layer):
+class MarginRankingLoss(Layer):
     r"""
     This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
@@ -1031,7 +1031,7 @@ class MarginRankingLoss(fluid.dygraph.Layer):
         return out

-class CTCLoss(fluid.dygraph.Layer):
+class CTCLoss(Layer):
     """
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
@@ -1127,7 +1127,7 @@ class CTCLoss(fluid.dygraph.Layer):
             norm_by_times=norm_by_times)

-class SmoothL1Loss(fluid.dygraph.Layer):
+class SmoothL1Loss(Layer):
     r"""
     This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
     term if the absolute element-wise error falls below 1 and an L1 term otherwise.
...
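The MSELoss hunk collapses `fluid.layers.square(fluid.layers.elementwise_sub(...))` into the public one-liner `paddle.square(paddle.subtract(...))`. A sketch of the equivalent computation with `'mean'` reduction, checked against the built-in layer:

```python
import paddle

input = paddle.to_tensor([1.5, 0.8], dtype='float32')
label = paddle.to_tensor([1.7, 1.0], dtype='float32')

# Same math the rewritten MSELoss forward performs:
square_out = paddle.square(paddle.subtract(input, label))
loss = paddle.mean(square_out)           # reduction='mean'

builtin = paddle.nn.MSELoss()(input, label)
print(float(loss), float(builtin))       # both ~0.04
```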
@@ -30,15 +30,13 @@
 import six

 from ...fluid.dygraph import BatchNorm  # noqa: F401
 from ...fluid.dygraph import SpectralNorm  # noqa: F401
-from ...fluid.dygraph import layers

 from ...framework import get_default_dtype, set_default_dtype
 from ...fluid.framework import in_dygraph_mode

-from ...fluid.initializer import Constant
-from ...fluid.param_attr import ParamAttr
+from ..initializer import Constant
+from ...framework import ParamAttr
 from ...fluid.data_feeder import check_variable_and_dtype, check_type
 from ...fluid import core, dygraph_utils
@@ -47,14 +45,15 @@ from ..functional import batch_norm, layer_norm, instance_norm
 import numpy as np
 import numbers
 import warnings
-from ...fluid.dygraph.base import no_grad
+from ...framework import no_grad
 from .. import functional as F
 from paddle import _C_ops
+from .. import Layer

 __all__ = []

-class _InstanceNormBase(layers.Layer):
+class _InstanceNormBase(Layer):
     """
     This is the base class for InstanceNorm1D, InstanceNorm2D and InstanceNorm3D.
@@ -317,7 +316,7 @@ class InstanceNorm3D(_InstanceNormBase):
                     len(input.shape)))

-class GroupNorm(layers.Layer):
+class GroupNorm(Layer):
     """
     This interface is used to construct a callable object of the ``GroupNorm`` class.
     For more details, refer to code examples.
@@ -436,7 +435,7 @@ class GroupNorm(layers.Layer):
             self._num_groups, self._num_channels, self._epsilon)

-class LayerNorm(layers.Layer):
+class LayerNorm(Layer):
     r"""
     :alias_main: paddle.nn.LayerNorm
     :alias: paddle.nn.LayerNorm, paddle.nn.layer.LayerNorm, paddle.nn.layer.norm.LayerNorm
@@ -544,7 +543,7 @@ class LayerNorm(layers.Layer):
             self._epsilon)

-class _BatchNormBase(layers.Layer):
+class _BatchNormBase(Layer):
     """
     BatchNorm base class.
     """
@@ -1181,7 +1180,7 @@ class SyncBatchNorm(_BatchNormBase):
         return layer_output

-class LocalResponseNorm(layers.Layer):
+class LocalResponseNorm(Layer):
     """
     Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
     For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
...
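norm.py now pulls `Constant` from `paddle.nn.initializer`, and `ParamAttr` and `no_grad` from `paddle.framework`, instead of their fluid counterparts. A hedged sketch of the same public spellings used to pin a norm layer's scale and bias:

```python
import paddle

# Public counterparts of the imports this hunk rewires:
weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.Constant(value=1.0))
bias_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.Constant(value=0.0))

ln = paddle.nn.LayerNorm([8], weight_attr=weight_attr, bias_attr=bias_attr)
x = paddle.randn([2, 8])
print(ln(x).shape)  # [2, 8]
```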
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ...fluid.dygraph import layers
 from ...fluid.layer_helper import LayerHelper
 from .. import functional as F
+from .. import Layer

 __all__ = []

-class AvgPool1D(layers.Layer):
+class AvgPool1D(Layer):
     r"""
     This operation applies a 1D average pooling over an input signal composed
     of several input planes, based on the input, output_size, return_mask parameters.
@@ -109,7 +109,7 @@ class AvgPool1D(layers.Layer):
                    **self.__dict__)

-class AvgPool2D(layers.Layer):
+class AvgPool2D(Layer):
     r"""
     This operation applies 2D average pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -220,7 +220,7 @@ class AvgPool2D(layers.Layer):
                    **self.__dict__)

-class AvgPool3D(layers.Layer):
+class AvgPool3D(Layer):
     """
     This operation applies 3D average pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -318,7 +318,7 @@ class AvgPool3D(layers.Layer):
                    **self.__dict__)

-class MaxPool1D(layers.Layer):
+class MaxPool1D(Layer):
     """
     This operation applies 1D max pooling over input signal
     composed of several input planes based on the input,
@@ -412,7 +412,7 @@ class MaxPool1D(layers.Layer):
                    **self.__dict__)

-class MaxPool2D(layers.Layer):
+class MaxPool2D(Layer):
     r"""
     This operation applies 2D max pooling over input feature based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -522,7 +522,7 @@ class MaxPool2D(layers.Layer):
                    **self.__dict__)

-class MaxPool3D(layers.Layer):
+class MaxPool3D(Layer):
     """
     This operation applies 3D max pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -620,7 +620,7 @@ class MaxPool3D(layers.Layer):
                    **self.__dict__)

-class AdaptiveAvgPool1D(layers.Layer):
+class AdaptiveAvgPool1D(Layer):
     r"""
     This operation applies a 1D adaptive average pooling over an input signal composed
@@ -693,7 +693,7 @@ class AdaptiveAvgPool1D(layers.Layer):
         return 'output_size={}'.format(self.output_size)

-class AdaptiveAvgPool2D(layers.Layer):
+class AdaptiveAvgPool2D(Layer):
     r"""
     This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
@@ -779,7 +779,7 @@ class AdaptiveAvgPool2D(layers.Layer):
         return 'output_size={}'.format(self._output_size)

-class AdaptiveAvgPool3D(layers.Layer):
+class AdaptiveAvgPool3D(Layer):
     r"""
     This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
@@ -872,7 +872,7 @@ class AdaptiveAvgPool3D(layers.Layer):
         return 'output_size={}'.format(self._output_size)

-class AdaptiveMaxPool1D(layers.Layer):
+class AdaptiveMaxPool1D(Layer):
     """
     This operation applies a 1D adaptive max pooling over an input signal composed
@@ -956,7 +956,7 @@ class AdaptiveMaxPool1D(layers.Layer):
                                                      self.return_mask)

-class AdaptiveMaxPool2D(layers.Layer):
+class AdaptiveMaxPool2D(Layer):
     """
     This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions
     of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and
@@ -1037,7 +1037,7 @@ class AdaptiveMaxPool2D(layers.Layer):
                                                      self._return_mask)

-class AdaptiveMaxPool3D(layers.Layer):
+class AdaptiveMaxPool3D(Layer):
     """
     This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are
     determined by the parameter output_size. The difference between adaptive pooling and pooling is that the adaptive one focuses
...
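All pooling classes in this file get the same one-line re-basing onto `paddle.nn.Layer`; their call behavior is unchanged. A quick sketch with `AvgPool1D`:

```python
import paddle

pool = paddle.nn.AvgPool1D(kernel_size=2, stride=2)
x = paddle.randn([1, 3, 8])   # (N, C, L)
print(pool(x).shape)          # [1, 3, 4]
```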
@@ -28,7 +28,7 @@ from paddle import framework
 from paddle.device import get_device, get_cudnn_version
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
-from paddle.fluid.dygraph import Layer, LayerList
+from paddle.nn import Layer, LayerList
 from paddle.fluid.layers import utils
 from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
 from paddle.fluid.data_feeder import convert_dtype
@@ -962,7 +962,7 @@ class RNNBase(LayerList):
             # for static-graph, append coalesce_tensor into startup program
             with fluid.program_guard(fluid.default_startup_program(),
                                      fluid.default_startup_program()):
-                with framework.no_grad():
+                with paddle.no_grad():
                     self._helper.append_op(
                         type="coalesce_tensor",
                         inputs={"Input": self._all_weights},
@@ -1040,10 +1040,10 @@ class RNNBase(LayerList):
                 ])
         else:
             initial_states = [initial_states] if isinstance(
-                initial_states,
-                paddle.fluid.framework.Variable) else initial_states
+                initial_states, paddle.static.Variable) else initial_states

-        if self.could_use_cudnn and (not fluid.core.is_compiled_with_rocm() or
-                                     sequence_length is None):
+        if self.could_use_cudnn and (
+                not paddle.device.is_compiled_with_rocm() or
+                sequence_length is None):
             # Add CPU kernel and dispatch in backend later
             return self._cudnn_impl(inputs, initial_states, sequence_length)
...
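rnn.py swaps three fluid-private spellings for public ones: `framework.no_grad` becomes `paddle.no_grad`, `paddle.fluid.framework.Variable` becomes `paddle.static.Variable`, and `fluid.core.is_compiled_with_rocm()` becomes `paddle.device.is_compiled_with_rocm()`. A quick sketch of the `no_grad` replacement, assuming dygraph mode:

```python
import paddle

lstm = paddle.nn.LSTM(input_size=4, hidden_size=8)

# paddle.no_grad is the public context manager the diff switches to;
# nothing computed inside it tracks gradients.
with paddle.no_grad():
    x = paddle.randn([2, 5, 4])               # (batch, seq_len, input_size)
    y, (h, c) = lstm(x)

print(y.shape)                                 # [2, 5, 8]
print(paddle.device.is_compiled_with_rocm())   # public ROCm probe
```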
@@ -24,8 +24,8 @@ from .norm import LayerNorm
 from .. import functional as F
 from ... import tensor
 from ...fluid import layers
-from ...fluid.dygraph import Layer, LayerList
-from ...fluid.param_attr import ParamAttr
+from .. import Layer, LayerList
+from ...framework import ParamAttr
 from ...fluid.data_feeder import convert_dtype

 __all__ = []
...
@@ -14,13 +14,13 @@
 # TODO: define special functions used in computer vision tasks

-from ...fluid.dygraph import layers
+from .. import Layer
 from .. import functional

 __all__ = []

-class PixelShuffle(layers.Layer):
+class PixelShuffle(Layer):
     """
     PixelShuffle Layer
...
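vision.py's `PixelShuffle` gets the same re-basing onto `paddle.nn.Layer`. A quick usage sketch:

```python
import paddle

# PixelShuffle rearranges (N, C*r*r, H, W) -> (N, C, H*r, W*r).
ps = paddle.nn.PixelShuffle(upscale_factor=3)
x = paddle.randn([2, 9, 4, 4])
print(ps(x).shape)  # [2, 1, 12, 12]
```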