diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 5fe17e8c193e3ea99eddbd8bfb2668e3a1228286..8f094877e74b6730ace0bd0222042a7aa2f60b48 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -15,6 +15,11 @@
 # TODO: import all neural network related api under this directory,
 # including layers, linear, conv, rnn etc.
 
+from ..fluid.dygraph.layers import Layer  # noqa: F401
+from ..fluid.dygraph.container import LayerList  # noqa: F401
+from ..fluid.dygraph.container import ParameterList  # noqa: F401
+from ..fluid.dygraph.container import Sequential  # noqa: F401
+
 from .clip import ClipGradByGlobalNorm  # noqa: F401
 from .clip import ClipGradByNorm  # noqa: F401
 from .clip import ClipGradByValue  # noqa: F401
@@ -130,10 +135,6 @@ from .utils.spectral_norm_hook import spectral_norm
 
 # TODO: remove loss, keep it for too many used in unitests
 from .layer import loss  # noqa: F401
-from ..fluid.dygraph.layers import Layer  # noqa: F401
-from ..fluid.dygraph.container import LayerList  # noqa: F401
-from ..fluid.dygraph.container import ParameterList  # noqa: F401
-from ..fluid.dygraph.container import Sequential  # noqa: F401
 
 from . import utils  # noqa: F401
 from . import functional  # noqa: F401
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index d5b37144cfffed55396787cc7745ea7b80639672..695e387bda84f073718b717ca201161489517b50 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -14,18 +14,18 @@
 
 # TODO: define activation functions of neural network
 
-from ...fluid.dygraph import layers
 from ...fluid import core
 from ...fluid.framework import in_dygraph_mode
-from ...fluid.param_attr import ParamAttr
-from ...fluid.initializer import Constant
+from ...framework import ParamAttr
+from ..initializer import Constant
 from paddle.framework import get_default_dtype
 from .. import functional as F
+from paddle.nn import Layer
 
 __all__ = []
 
 
-class ELU(layers.Layer):
+class ELU(Layer):
     r"""
     ELU Activation.
@@ -67,7 +67,7 @@ class ELU(layers.Layer):
         return 'alpha={}{}'.format(self._alpha, name_str)
 
 
-class GELU(layers.Layer):
+class GELU(Layer):
     r"""
     GELU Activation.
@@ -120,7 +120,7 @@ class GELU(layers.Layer):
         return 'approximate={}{}'.format(self._approximate, name_str)
 
 
-class Hardshrink(layers.Layer):
+class Hardshrink(Layer):
     r"""
     Hardshrink Activation
@@ -168,7 +168,7 @@ class Hardshrink(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)
 
 
-class Hardswish(layers.Layer):
+class Hardswish(Layer):
     r"""
     Hardswish activation
@@ -218,7 +218,7 @@ class Hardswish(layers.Layer):
         return name_str
 
 
-class Tanh(layers.Layer):
+class Tanh(Layer):
     r"""
     Tanh Activation.
@@ -259,7 +259,7 @@ class Tanh(layers.Layer):
         return name_str
 
 
-class Hardtanh(layers.Layer):
+class Hardtanh(Layer):
     r"""
     Hardtanh Activation
@@ -305,7 +305,7 @@ class Hardtanh(layers.Layer):
         return 'min={}, max={}{}'.format(self._min, self._max, name_str)
 
 
-class PReLU(layers.Layer):
+class PReLU(Layer):
     """
     PReLU Activation.
@@ -377,7 +377,7 @@ class PReLU(layers.Layer):
             self._num_parameters, self._init, self._dtype, name_str)
 
 
-class ReLU(layers.Layer):
+class ReLU(Layer):
     """
     ReLU Activation.
@@ -415,7 +415,7 @@ class ReLU(layers.Layer):
         return name_str
 
 
-class ReLU6(layers.Layer):
+class ReLU6(Layer):
     """
     ReLU6 Activation
@@ -454,7 +454,7 @@ class ReLU6(layers.Layer):
         return name_str
 
 
-class SELU(layers.Layer):
+class SELU(Layer):
     r"""
     SELU Activation
@@ -505,7 +505,7 @@ class SELU(layers.Layer):
             name_str)
 
 
-class LeakyReLU(layers.Layer):
+class LeakyReLU(Layer):
     r"""
     Leaky ReLU Activation.
@@ -553,7 +553,7 @@ class LeakyReLU(layers.Layer):
         return 'negative_slope={}{}'.format(self._negative_slope, name_str)
 
 
-class Sigmoid(layers.Layer):
+class Sigmoid(Layer):
     """
     this interface is used to construct a callable object of the ``Sigmoid`` class.
     This layer calcluate the `sigmoid` of input x.
@@ -593,7 +593,7 @@ class Sigmoid(layers.Layer):
         return name_str
 
 
-class Hardsigmoid(layers.Layer):
+class Hardsigmoid(Layer):
     r"""
     This interface is used to construct a callable object of the ``Hardsigmoid`` class.
     This layer calcluate the `hardsigmoid` of input x.
@@ -644,7 +644,7 @@ class Hardsigmoid(layers.Layer):
         return name_str
 
 
-class Softplus(layers.Layer):
+class Softplus(Layer):
     r"""
     Softplus Activation
@@ -689,7 +689,7 @@ class Softplus(layers.Layer):
             name_str)
 
 
-class Softshrink(layers.Layer):
+class Softshrink(Layer):
     r"""
     Softshrink Activation
@@ -734,7 +734,7 @@ class Softshrink(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)
 
 
-class Softsign(layers.Layer):
+class Softsign(Layer):
     r"""
     Softsign Activation
@@ -773,7 +773,7 @@ class Softsign(layers.Layer):
         return name_str
 
 
-class Swish(layers.Layer):
+class Swish(Layer):
     r"""
     Swish Activation.
@@ -812,7 +812,7 @@ class Swish(layers.Layer):
         return name_str
 
 
-class Tanhshrink(layers.Layer):
+class Tanhshrink(Layer):
     """
     Tanhshrink Activation
@@ -851,7 +851,7 @@ class Tanhshrink(layers.Layer):
         return name_str
 
 
-class ThresholdedReLU(layers.Layer):
+class ThresholdedReLU(Layer):
     r"""
     Thresholded ReLU Activation
@@ -895,7 +895,7 @@ class ThresholdedReLU(layers.Layer):
         return 'threshold={}{}'.format(self._threshold, name_str)
 
 
-class Silu(layers.Layer):
+class Silu(Layer):
     """
     Silu Activation.
     .. math::
@@ -933,7 +933,7 @@ class Silu(layers.Layer):
         return name_str
 
 
-class LogSigmoid(layers.Layer):
+class LogSigmoid(Layer):
     r"""
     LogSigmoid Activation.
@@ -972,7 +972,7 @@ class LogSigmoid(layers.Layer):
         return name_str
 
 
-class Softmax(layers.Layer):
+class Softmax(Layer):
     r"""
     Softmax Activation.
@@ -1099,7 +1099,7 @@ class Softmax(layers.Layer):
         return 'axis={}{}'.format(self._axis, name_str)
 
 
-class LogSoftmax(layers.Layer):
+class LogSoftmax(Layer):
     r"""
     This operator implements the log_softmax layer. The calculation process is as follows:
@@ -1157,7 +1157,7 @@ class LogSoftmax(layers.Layer):
         return 'axis={}{}'.format(self._axis, name_str)
 
 
-class Maxout(layers.Layer):
+class Maxout(Layer):
     r"""
     Maxout Activation.
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 1d7f7c6589986bdf478347d428c5ec689a7be882..9aa8097befc98bfc6bb93f083411a0d4e534bbb5 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -15,10 +15,10 @@
 # TODO: define the common classes to build a neural network
 import paddle
 from ...fluid.dygraph import Flatten  # noqa: F401
-from ...fluid.dygraph import layers
 from ...fluid.framework import in_dygraph_mode
 from .. import functional as F
 from ...fluid.framework import _dygraph_tracer
+from paddle.nn import Layer
 
 __all__ = []
@@ -30,7 +30,7 @@ def _npairs(x, n):
     return x
 
 
-class Linear(layers.Layer):
+class Linear(Layer):
     r"""
     Fully-connected linear transformation layer. For each input :math:`X` ,
@@ -135,7 +135,7 @@ class Linear(layers.Layer):
             self.weight.shape[0], self.weight.shape[1], self._dtype, name_str)
 
 
-class Upsample(layers.Layer):
+class Upsample(Layer):
     """
     This op resizes a batch of images.
@@ -385,7 +385,7 @@ class Upsample(layers.Layer):
             self.data_format, name_str)
 
 
-class UpsamplingNearest2D(layers.Layer):
+class UpsamplingNearest2D(Layer):
     """
     This op upsamples a batch of images, using nearest neighbours' pixel values.
     The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w),
@@ -470,7 +470,7 @@ class UpsamplingNearest2D(layers.Layer):
             name_str)
 
 
-class UpsamplingBilinear2D(layers.Layer):
+class UpsamplingBilinear2D(Layer):
     """
     This op upsamples a batch of images, using bilinear' pixel values.
     The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w),
@@ -556,7 +556,7 @@ class UpsamplingBilinear2D(layers.Layer):
             name_str)
 
 
-class Bilinear(layers.Layer):
+class Bilinear(Layer):
     r"""
     This layer performs bilinear on two inputs.
@@ -651,7 +651,7 @@ class Bilinear(layers.Layer):
             self._dtype, name_str)
 
 
-class Dropout(layers.Layer):
+class Dropout(Layer):
     """
     Dropout is a regularization technique for reducing overfitting by preventing
     neuron co-adaption during training as described in the paper:
@@ -725,7 +725,7 @@ class Dropout(layers.Layer):
             name_str)
 
 
-class Dropout2D(layers.Layer):
+class Dropout2D(Layer):
     """
     Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` ,
     a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently
@@ -786,7 +786,7 @@ class Dropout2D(layers.Layer):
             name_str)
 
 
-class Dropout3D(layers.Layer):
+class Dropout3D(Layer):
     """
     Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` ,
     a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently
@@ -847,7 +847,7 @@ class Dropout3D(layers.Layer):
             name_str)
 
 
-class AlphaDropout(layers.Layer):
+class AlphaDropout(Layer):
     """
     Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with
     zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and
@@ -900,7 +900,7 @@ class AlphaDropout(layers.Layer):
         return 'p={}{}'.format(self.p, name_str)
 
 
-class Pad1D(layers.Layer):
+class Pad1D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad1D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -981,7 +981,7 @@ class Pad1D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)
 
 
-class Pad2D(layers.Layer):
+class Pad2D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad2D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -1065,7 +1065,7 @@ class Pad2D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)
 
 
-class Pad3D(layers.Layer):
+class Pad3D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad3D`` class.
     Pad tensor according to 'pad', 'mode' and 'value'.
@@ -1149,7 +1149,7 @@ class Pad3D(layers.Layer):
             self._pad, self._mode, self._value, self._data_format, name_str)
 
 
-class CosineSimilarity(layers.Layer):
+class CosineSimilarity(Layer):
     """
     This interface is used to compute cosine similarity between x1 and x2 along axis.
@@ -1206,7 +1206,7 @@ class CosineSimilarity(layers.Layer):
         return 'axis={_axis}, eps={_eps}'.format(**self.__dict__)
 
 
-class Embedding(layers.Layer):
+class Embedding(Layer):
     r"""
     **Embedding Layer**
@@ -1367,7 +1367,7 @@ class Embedding(layers.Layer):
         return main_str.format(**self.__dict__)
 
 
-class Unfold(layers.Layer):
+class Unfold(Layer):
     """
     This op returns a col buffer of sliding local blocks of input x, also known
     as im2col for batched 2D image tensors. For each block under the convolution filter,
diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py
index 48697aa8f509090d44a173a2bc47b1a18184a622..aadaf1efce50faf0c81238ea1f3ea0eda1f87513 100644
--- a/python/paddle/nn/layer/container.py
+++ b/python/paddle/nn/layer/container.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from collections import OrderedDict
-from ...fluid.dygraph.layers import Layer
+from .. import Layer
 from collections.abc import Iterable, Mapping
 
 __all__ = []
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 76011aeff5b4fb129dac365be63068e494c258fd..26fd544ecce11234301b948f91128a4e6c052210 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -19,8 +19,8 @@ import numpy as np
 from ...fluid import get_flags
 from ...fluid import core
 from ...device import get_cudnn_version
-from ...fluid.dygraph import layers
-from ...fluid.initializer import Normal
+from .. import Layer
+from ..initializer import Normal
 from .. import functional as F
 from ...fluid.layers import utils
 from ..functional.conv import _update_padding_nd
@@ -31,7 +31,7 @@ __all__ = []
 def _get_default_param_initializer(num_channels, filter_size):
     filter_elem_num = num_channels * np.prod(filter_size)
     std = (2.0 / filter_elem_num)**0.5
-    return Normal(0.0, std, 0)
+    return Normal(0.0, std)
 
 
 def _reverse_repeat_list(t, n):
@@ -42,7 +42,7 @@ def _reverse_repeat_list(t, n):
     return list(x for x in reversed(t) for _ in range(n))
 
 
-class _ConvNd(layers.Layer):
+class _ConvNd(Layer):
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -127,7 +127,7 @@ class _ConvNd(layers.Layer):
                 return None
             filter_elem_num = np.prod(self._kernel_size) * self._in_channels
             std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
+            return Normal(0.0, std)
 
         self.weight = self.create_parameter(
             shape=filter_shape,
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
index 27e904980d143d8e80282cdd6e6d5adc40ef5dcb..0547bf75a4bf6c4b2b4a878fdf37f00c007ef4bc 100644
--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 import paddle
-from ...fluid.dygraph import layers
+from .. import Layer
 from ...fluid.framework import core, in_dygraph_mode
 from ...fluid.data_feeder import check_variable_and_dtype, check_type
 from ...fluid.layer_helper import LayerHelper
@@ -24,7 +24,7 @@ from paddle import _C_ops
 __all__ = []
 
 
-class PairwiseDistance(layers.Layer):
+class PairwiseDistance(Layer):
     r"""
     This operator computes the pairwise distance between two vectors. The
     distance is calculated by p-oreder norm:
@@ -87,7 +87,7 @@ class PairwiseDistance(layers.Layer):
                                      'PairwiseDistance')
             check_variable_and_dtype(y, 'y', ['float32', 'float64'],
                                      'PairwiseDistance')
-            sub = paddle.fluid.layers.elementwise_sub(x, y)
+            sub = paddle.subtract(x, y)
 
             helper = LayerHelper("PairwiseDistance", name=self.name)
             attrs = {
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 8f43eb8866b4bb7e6d1738999b7f64335fa62185..31b552bed162c2b1152acfd8252aaea7cb106eb8 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -20,11 +20,12 @@ import paddle.fluid.core as core
 import paddle
 from .. import functional as F
 from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator
+from .. import Layer
 
 __all__ = []
 
 
-class BCEWithLogitsLoss(fluid.dygraph.Layer):
+class BCEWithLogitsLoss(Layer):
     r"""
     This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
     Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
@@ -128,7 +129,7 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer):
         return out
 
 
-class CrossEntropyLoss(fluid.dygraph.Layer):
+class CrossEntropyLoss(Layer):
     r"""
     By default, this operator implements the cross entropy loss function with softmax.
     This function combines the calculation of the softmax operation and the cross entropy loss function
@@ -407,7 +408,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
         return ret
 
 
-class HSigmoidLoss(fluid.dygraph.Layer):
+class HSigmoidLoss(Layer):
     """
     Hierarchical Sigmoid Layer.
@@ -529,7 +530,7 @@ class HSigmoidLoss(fluid.dygraph.Layer):
         return out
 
 
-class MSELoss(fluid.dygraph.layers.Layer):
+class MSELoss(Layer):
     r"""
     **Mean Square Error Loss**
     Computes the mean square error (squared L2 norm) of given input and label.
@@ -596,8 +597,7 @@ class MSELoss(fluid.dygraph.layers.Layer):
             fluid.data_feeder.check_variable_and_dtype(
                 label, 'label', ['float32', 'float64'], 'MSELoss')
 
-        square_out = fluid.layers.square(
-            fluid.layers.elementwise_sub(input, label))
+        square_out = paddle.square(paddle.subtract(input, label))
 
         if self.reduction == 'none':
             return square_out
@@ -608,7 +608,7 @@ class MSELoss(fluid.dygraph.layers.Layer):
         return getattr(fluid.layers, reduce_op)(square_out)
 
 
-class L1Loss(fluid.dygraph.Layer):
+class L1Loss(Layer):
     r"""
     This interface is used to construct a callable object of the ``L1Loss`` class.
     The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
@@ -687,7 +687,7 @@ class L1Loss(fluid.dygraph.Layer):
             input, label, self.reduction, name=self.name)
 
 
-class BCELoss(fluid.dygraph.Layer):
+class BCELoss(Layer):
     """
     This interface is used to construct a callable object of the ``BCELoss`` class.
     The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
@@ -777,7 +777,7 @@ class BCELoss(fluid.dygraph.Layer):
         return out
 
 
-class NLLLoss(fluid.dygraph.Layer):
+class NLLLoss(Layer):
     r"""
     :alias_main: paddle.nn.NLLLoss
     :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss
@@ -886,7 +886,7 @@ class NLLLoss(fluid.dygraph.Layer):
             name=self._name)
 
 
-class KLDivLoss(fluid.dygraph.Layer):
+class KLDivLoss(Layer):
     r"""
     This interface calculates the Kullback-Leibler divergence loss
     between Input(X) and Input(Target). Notes that Input(X) is the
@@ -959,7 +959,7 @@ class KLDivLoss(fluid.dygraph.Layer):
         return out
 
 
-class MarginRankingLoss(fluid.dygraph.Layer):
+class MarginRankingLoss(Layer):
     r"""
     This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
@@ -1031,7 +1031,7 @@ class MarginRankingLoss(fluid.dygraph.Layer):
         return out
 
 
-class CTCLoss(fluid.dygraph.Layer):
+class CTCLoss(Layer):
     """
     An operator integrating the open source Warp-CTC library
     (https://github.com/baidu-research/warp-ctc)
@@ -1127,7 +1127,7 @@ class CTCLoss(fluid.dygraph.Layer):
             norm_by_times=norm_by_times)
 
 
-class SmoothL1Loss(fluid.dygraph.Layer):
+class SmoothL1Loss(Layer):
     r"""
     This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
     term if the absolute element-wise error falls below 1 and an L1 term otherwise.
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index b93412a7b22ccd8d354c1263fa5a6c476303e469..9abbc494258948ff81e8b547048e14a173d53979 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -30,15 +30,13 @@ import six
 
 from ...fluid.dygraph import BatchNorm  # noqa: F401
-
 from ...fluid.dygraph import SpectralNorm  # noqa: F401
 
-from ...fluid.dygraph import layers
 from ...framework import get_default_dtype, set_default_dtype
 from ...fluid.framework import in_dygraph_mode
-from ...fluid.initializer import Constant
-from ...fluid.param_attr import ParamAttr
+from ..initializer import Constant
+from ...framework import ParamAttr
 from ...fluid.data_feeder import check_variable_and_dtype, check_type
 from ...fluid import core, dygraph_utils
@@ -47,14 +45,15 @@ from ..functional import batch_norm, layer_norm, instance_norm
 import numpy as np
 import numbers
 import warnings
-from ...fluid.dygraph.base import no_grad
+from ...framework import no_grad
 from .. import functional as F
 from paddle import _C_ops
+from .. import Layer
 
 __all__ = []
 
 
-class _InstanceNormBase(layers.Layer):
+class _InstanceNormBase(Layer):
     """
     This class is based class for InstanceNorm1D, 2d, 3d.
@@ -317,7 +316,7 @@ class InstanceNorm3D(_InstanceNormBase):
                     len(input.shape)))
 
 
-class GroupNorm(layers.Layer):
+class GroupNorm(Layer):
     """
     This interface is used to construct a callable object of the ``GroupNorm`` class.
     For more details, refer to code examples.
@@ -436,7 +435,7 @@ class GroupNorm(layers.Layer):
             self._num_groups, self._num_channels, self._epsilon)
 
 
-class LayerNorm(layers.Layer):
+class LayerNorm(Layer):
     r"""
     :alias_main: paddle.nn.LayerNorm
     :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
@@ -544,7 +543,7 @@ class LayerNorm(layers.Layer):
             self._epsilon)
 
 
-class _BatchNormBase(layers.Layer):
+class _BatchNormBase(Layer):
     """
     BatchNorm base .
     """
@@ -1181,7 +1180,7 @@ class SyncBatchNorm(_BatchNormBase):
         return layer_output
 
 
-class LocalResponseNorm(layers.Layer):
+class LocalResponseNorm(Layer):
     """
     Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
     For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 528572ee21b7cc0859c0488bc791239418a4c9f8..881f92568414dcbf03dbc3e91569fc0812492716 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.dygraph import layers
 from ...fluid.layer_helper import LayerHelper
 from .. import functional as F
+from .. import Layer
 
 __all__ = []
 
 
-class AvgPool1D(layers.Layer):
+class AvgPool1D(Layer):
     r"""
     This operation applies a 1D average pooling over an input signal composed
     of several input planes, based on the input, output_size, return_mask parameters.
@@ -109,7 +109,7 @@ class AvgPool1D(layers.Layer):
             **self.__dict__)
 
 
-class AvgPool2D(layers.Layer):
+class AvgPool2D(Layer):
     r"""
     This operation applies 2D average pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -220,7 +220,7 @@ class AvgPool2D(layers.Layer):
             **self.__dict__)
 
 
-class AvgPool3D(layers.Layer):
+class AvgPool3D(Layer):
     """
     This operation applies 3D max pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -318,7 +318,7 @@ class AvgPool3D(layers.Layer):
             **self.__dict__)
 
 
-class MaxPool1D(layers.Layer):
+class MaxPool1D(Layer):
     """
     This operation applies 1D max pooling over input signal
     composed of several input planes based on the input,
@@ -412,7 +412,7 @@ class MaxPool1D(layers.Layer):
             **self.__dict__)
 
 
-class MaxPool2D(layers.Layer):
+class MaxPool2D(Layer):
     r"""
     This operation applies 2D max pooling over input feature based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -522,7 +522,7 @@ class MaxPool2D(layers.Layer):
             **self.__dict__)
 
 
-class MaxPool3D(layers.Layer):
+class MaxPool3D(Layer):
     """
     This operation applies 3D max pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
@@ -620,7 +620,7 @@ class MaxPool3D(layers.Layer):
             **self.__dict__)
 
 
-class AdaptiveAvgPool1D(layers.Layer):
+class AdaptiveAvgPool1D(Layer):
     r"""
     This operation applies a 1D adaptive average pooling over an input signal composed
@@ -693,7 +693,7 @@ class AdaptiveAvgPool1D(layers.Layer):
         return 'output_size={}'.format(self.output_size)
 
 
-class AdaptiveAvgPool2D(layers.Layer):
+class AdaptiveAvgPool2D(Layer):
     r"""
     This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
@@ -779,7 +779,7 @@ class AdaptiveAvgPool2D(layers.Layer):
         return 'output_size={}'.format(self._output_size)
 
 
-class AdaptiveAvgPool3D(layers.Layer):
+class AdaptiveAvgPool3D(Layer):
     r"""
     This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
@@ -872,7 +872,7 @@ class AdaptiveAvgPool3D(layers.Layer):
         return 'output_size={}'.format(self._output_size)
 
 
-class AdaptiveMaxPool1D(layers.Layer):
+class AdaptiveMaxPool1D(Layer):
     """
     This operation applies a 1D adaptive max pooling over an input signal composed
@@ -956,7 +956,7 @@ class AdaptiveMaxPool1D(layers.Layer):
             self.return_mask)
 
 
-class AdaptiveMaxPool2D(layers.Layer):
+class AdaptiveMaxPool2D(Layer):
     """
     This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions
     of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and
@@ -1037,7 +1037,7 @@ class AdaptiveMaxPool2D(layers.Layer):
            self._return_mask)
 
 
-class AdaptiveMaxPool3D(layers.Layer):
+class AdaptiveMaxPool3D(Layer):
     """
     This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions
     of the output tensor are determined by the parameter output_size.
     The difference between adaptive pooling and pooling is adaptive one focus
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 693ec0200b0d059eca7871742dabae00e0580bdf..77168566d88c6055bdce3a8f168b102a1ef29343 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -28,7 +28,7 @@ from paddle import framework
 from paddle.device import get_device, get_cudnn_version
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
-from paddle.fluid.dygraph import Layer, LayerList
+from paddle.nn import Layer, LayerList
 from paddle.fluid.layers import utils
 from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
 from paddle.fluid.data_feeder import convert_dtype
@@ -962,7 +962,7 @@ class RNNBase(LayerList):
             # for static-graph, append coalesce_tensor into startup program
             with fluid.program_guard(fluid.default_startup_program(),
                                      fluid.default_startup_program()):
-                with framework.no_grad():
+                with paddle.no_grad():
                     self._helper.append_op(
                         type="coalesce_tensor",
                         inputs={"Input": self._all_weights},
@@ -1040,11 +1040,11 @@ class RNNBase(LayerList):
             ])
         else:
             initial_states = [initial_states] if isinstance(
-                initial_states,
-                paddle.fluid.framework.Variable) else initial_states
+                initial_states, paddle.static.Variable) else initial_states
 
-        if self.could_use_cudnn and (not fluid.core.is_compiled_with_rocm() or
-                                     sequence_length is None):
+        if self.could_use_cudnn and (
+                not paddle.device.is_compiled_with_rocm() or
+                sequence_length is None):
             # Add CPU kernel and dispatch in backend later
             return self._cudnn_impl(inputs, initial_states, sequence_length)
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 5aba8ae85ad1b32a35de48cddc8dadd5d3929e70..eacf5aac9daa9f2d37795aaed5695df6b642f786 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -24,8 +24,8 @@ from .norm import LayerNorm
 from .. import functional as F
 from ... import tensor
 from ...fluid import layers
-from ...fluid.dygraph import Layer, LayerList
-from ...fluid.param_attr import ParamAttr
+from .. import Layer, LayerList
+from ...framework import ParamAttr
 from ...fluid.data_feeder import convert_dtype
 
 __all__ = []
diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py
index e6d3af9a37b329231d625a4542eecea54d943e50..7f8b51ca10818ec10a794f2910f066f16cf26278 100644
--- a/python/paddle/nn/layer/vision.py
+++ b/python/paddle/nn/layer/vision.py
@@ -14,13 +14,13 @@
 
 # TODO: define specitial functions used in computer vision task
 
-from ...fluid.dygraph import layers
+from .. import Layer
 from .. import functional
 
 __all__ = []
 
 
-class PixelShuffle(layers.Layer):
+class PixelShuffle(Layer):
     """
     PixelShuffle Layer
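For reviewers, below is a minimal usage sketch (not part of the patch) of the public-API style this change migrates to: `paddle.nn.Layer` in place of `fluid.dygraph.layers.Layer`, the two-argument `Normal(mean, std)` initializer, `paddle.subtract`/`paddle.square` in place of the removed `fluid.layers.elementwise_sub`/`fluid.layers.square`, and `paddle.no_grad` in place of `framework.no_grad`. The `ScaledDiff` layer and its parameter are hypothetical and exist only for illustration.

```python
# Minimal sketch (assumption: not part of this patch) of the paddle.nn API
# surface that the diff standardizes on.
import numpy as np

import paddle
from paddle import ParamAttr
from paddle.nn import Layer
from paddle.nn.initializer import Normal


class ScaledDiff(Layer):
    # Hypothetical layer, for illustration only.
    def __init__(self, num_channels, kernel_size):
        super(ScaledDiff, self).__init__()
        filter_elem_num = num_channels * np.prod(kernel_size)
        std = (2.0 / filter_elem_num)**0.5
        # Two-argument initializer form used throughout the patch: Normal(mean, std).
        self.scale = self.create_parameter(
            shape=[1], attr=ParamAttr(initializer=Normal(0.0, std)))

    def forward(self, x, y):
        # paddle.subtract / paddle.square replace the removed
        # fluid.layers.elementwise_sub / fluid.layers.square calls.
        return self.scale * paddle.square(paddle.subtract(x, y))


# paddle.no_grad replaces fluid.dygraph.base.no_grad / framework.no_grad.
with paddle.no_grad():
    layer = ScaledDiff(num_channels=4, kernel_size=(3, 3))
    out = layer(paddle.rand([2, 4]), paddle.rand([2, 4]))
```

Because `python/paddle/nn/__init__.py` now re-exports `Layer`, `LayerList`, `ParameterList`, and `Sequential` before the submodule imports, the `paddle.nn.layer.*` modules can import them via `from .. import Layer` without a circular-import problem.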