Unverified commit 1d5cad23, authored by GGBond8488, committed by GitHub

【fluid clean】Move out layers and layers helper (#49415)

* remove unused fluid beam_search_decoder

* move Layer and related helper to paddle.nn.common

* modify Layer references from dygraph.layers.Layer to paddle.nn.common.layers

* stash change

* remove fluid layer_object_helper, layers.py

* remove fluid layers init

* add setup

* fix unittest

* delete layers in fluid.dygraph

* merge paddle.tensor.stat.py

* fix circular import

* fix circular import

* remove redundant in_dygraph_mode import

* remove paddle.nn.common.* from fluid.__init__

* recover nn.rnn

* paddle.framework lazily imports paddle.jit to avoid a circular import

* remove remaining dygraph.layers references

* merge develop

* fix import error

* fix test error

* fix merge error

* fix test fluid.Layer

* fix test error

* fix test error

* fix import error

* fix import error

* fix comments

* fix circular import

* fix rnn import error

* fix circular import
Parent 5b6d2f85
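Most of this diff is the same mechanical substitution: the user-visible Layer base class moves from fluid.dygraph to paddle.nn, and internal imports move to paddle.nn.layer.layers. A minimal before/after sketch of the user-facing change (the MLP name and sizes are illustrative, not taken from the diff):

import paddle

# before this PR: from paddle.fluid.dygraph.layers import Layer
# after this PR:  from paddle.nn import Layer
# (internals such as _convert_camel_to_snake now live in paddle.nn.layer.layers)


class MLP(paddle.nn.Layer):  # previously: class MLP(fluid.Layer)
    def __init__(self, hidden_size=8):
        super().__init__()
        self._linear = paddle.nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        return self._linear(x)


net = MLP()
out = net(paddle.randn([2, 8]))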
......@@ -14,7 +14,6 @@
import paddle
from paddle.fluid import core
from paddle.nn import Layer
from paddle.nn import functional as F
from ...base import topology as tp
......@@ -32,7 +31,7 @@ def is_fused_matmul_bias_supported():
return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')
class VocabParallelEmbedding(Layer):
class VocabParallelEmbedding(paddle.nn.Layer):
"""Embedding mp parallelized in the vocabulary dimension.
this class is used for splitting embedding in mp group.
......@@ -170,7 +169,7 @@ class VocabParallelEmbedding(Layer):
return output
class ColumnParallelLinear(Layer):
class ColumnParallelLinear(paddle.nn.Layer):
"""Linear layer with mp parallelized(column).
this class is used for splitting Linear Layer in mp group, column split the weight of the Linear layer.
......@@ -329,7 +328,7 @@ class ColumnParallelLinear(Layer):
return output
class RowParallelLinear(Layer):
class RowParallelLinear(paddle.nn.Layer):
"""Linear layer with mp parallelized(row).
this class is used for splitting Linear Layer in mp group, row split the weight of the Linear layer.
......@@ -495,7 +494,7 @@ class RowParallelLinear(Layer):
return output
class ParallelCrossEntropy(Layer):
class ParallelCrossEntropy(paddle.nn.Layer):
"""CrossEntropy with mp parallelized.
this class is used for splitting softmax cross entropy in mp group.
......
......@@ -46,7 +46,8 @@ from paddle.distributed.fleet.launch_utils import check_backend
# (TODO: GhostScreaming) It will be removed later.
from paddle.framework import ParamBase, _set_expected_place
from paddle.framework import base as imperative_base
from paddle.framework import core, in_dygraph_mode, layers, to_variable
from paddle.framework import core, in_dygraph_mode, to_variable
from paddle.nn.layer import layers
from paddle.utils import deprecated
from . import parallel_helper
......
......@@ -86,7 +86,6 @@ from .parallel_executor import *
from . import compiler
from .compiler import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
from .dygraph.layers import *
from .dygraph.base import enable_dygraph, disable_dygraph
from .dygraph.varbase_patch_methods import monkey_patch_varbase
from .core import _cuda_synchronize
......
......@@ -135,7 +135,7 @@ class TestCorrelationOp(unittest.TestCase):
np.testing.assert_allclose(res[0], out_np, rtol=1e-05, atol=1e-8)
class Net(fluid.dygraph.Layer):
class Net(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -15,9 +15,6 @@
from . import base
from .base import *
from . import layers
from .layers import *
from . import tracer
from .tracer import *
......@@ -27,6 +24,5 @@ from .learning_rate_scheduler import *
from .math_op_patch import monkey_patch_math_varbase
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
__all__ += learning_rate_scheduler.__all__
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from paddle.fluid.framework import default_main_program, in_dygraph_mode
class LayerOpsRecoder:
"""
Records information about the operators generated inside an nn.Layer.
"""
def __init__(self, start=-1, end=-1, ops=None, is_valid=False, hooks=None):
self.start = start
self.end = end
self.ops = ops
self.is_valid = is_valid
self.hooks = hooks
def record_program_ops_pre_hook(layer, inputs):
"""
A pre-hook that records the op count before entering layer.forward.
"""
if not in_dygraph_mode():
if layer._op_recorder.start < 0:
layer._op_recorder.start = len(
default_main_program().current_block().ops
)
layer._op_recorder.is_valid = True
else:
layer._op_recorder.is_valid = False
warnings.warn(
"{} has recorded the op information before. Please check whether you call this layer twice.".format(
layer._full_name
)
)
return None
def set_op_customized_attrs_post_hook(layer, inputs, outputs):
"""
A post-hook that appends customized attributes to all operators generated in the current layer.
"""
if not in_dygraph_mode() and layer._op_recorder.is_valid:
start = layer._op_recorder.start
end = len(default_main_program().current_block().ops)
assert start >= 0 and end >= start
ops = default_main_program().current_block().ops[start:end]
layer._op_recorder.end = end
layer._op_recorder.ops = ops
for op in ops:
for attr_name, val in layer._customized_attrs.items():
op._set_attr(attr_name, val)
# remove pre-hook and post-hook
for hook_helper in layer._op_recorder.hooks:
hook_helper.remove()
return None
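These two hooks only act in static-graph mode: the pre-hook remembers how many ops the current block already holds, and the post-hook stamps every op created by the layer's forward with the layer's customized attributes, then detaches both hooks. A minimal usage sketch, assuming the two hook functions above are in scope and that paddle.nn.Layer still carries the _op_recorder and _customized_attrs fields they reference (the attribute name "my_attr" is illustrative):

import paddle

paddle.enable_static()

linear = paddle.nn.Linear(4, 2)
linear._customized_attrs = {"my_attr": "block_0"}  # attrs to stamp onto generated ops

# register the hooks defined above; both calls return handles with .remove()
pre = linear.register_forward_pre_hook(record_program_ops_pre_hook)
post = linear.register_forward_post_hook(set_op_customized_attrs_post_hook)
linear._op_recorder.hooks = [pre, post]

x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")
y = linear(x)  # ops created here get my_attr set; the hooks then remove themselves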
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from ..framework import Parameter, in_dygraph_mode, _global_flags
from ..param_attr import ParamAttr
from .. import core
from ..layer_helper_base import LayerHelperBase
from ..dygraph_utils import _append_activation_in_dygraph
class LayerObjectHelper(LayerHelperBase):
def __init__(self, name):
super().__init__(name, layer_type=name)
def append_op(
self,
type=None,
inputs=None,
outputs=None,
attrs=None,
stop_gradient=None,
):
"""append an operator for this layer object.
Args:
type: operator type
inputs: input variable of the operator
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
return self.main_program.current_block().append_op(
type=type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=stop_gradient,
)
def _multiple_input(self, inputs_in):
inputs = inputs_in
ret = []
if isinstance(inputs, (list, tuple)):
for inp in inputs:
ret.append(self.to_variable(inp))
else:
ret.append(self.to_variable(inputs))
return ret
# TODO: make it public when we need it
def _input(self, inputs_in):
inputs = self._multiple_input(inputs_in)
if len(inputs) != 1:
raise "{0} layer only takes one input in".format(self.layer_type)
return inputs[0]
def _multiple_param_attr(self, length, param_attr_in=None):
param_attr = param_attr_in
if isinstance(param_attr, ParamAttr):
param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length:
raise ValueError(
"parameter number mismatch in {}".format(self.name)
)
elif len(param_attr) == 1 and length != 1:
tmp = [None] * length
for i in range(length):
tmp[i] = copy.deepcopy(param_attr[0])
param_attr = tmp
return param_attr
def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
"""Access all inputs and params one by one
Args:
inputs_in: inputs to iterate over
param_attr_in: param_attrs to iterate over
Returns input, param_attr
"""
param_attr_in = ParamAttr._to_attr(param_attr_in)
if isinstance(param_attr_in, bool):
raise ValueError(
'Param_attr should not be False in {}'.format(self.name)
)
inputs = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs)
param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
for ipt, param_attr in zip(inputs, param_attrs):
yield ipt, param_attr
def input_dtype(self, inputs_in):
"""Get input data type
Args:
inputs_in: inputs whose data type is wanted
Returns dtype of the input
"""
inputs_in = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs_in)
dtype = None
for each in inputs:
if dtype is None:
dtype = each.dtype
elif dtype != each.dtype:
raise ValueError(
"Data Type mismatch: %d to %d in %s"
% (dtype, each.dtype, self.name)
)
return dtype
def get_parameter(self, name):
"""Get parameter specifically
Args:
name: parameter's name
Returns target parameter
"""
param = self.main_program.global_block().var(name)
if not isinstance(param, Parameter):
raise ValueError(
"no Parameter name %s found in %s" % (name, self.name)
)
return param
# TODO: this should not be called anymore after all activation func move to Layers
def append_activation(self, input_var, act=None, use_cudnn=None):
"""Append activation
Args:
input_var: the input variable; len(input_var.shape) must be
greater than or equal to 2.
act: activation type
use_cudnn: if use cudnn
Return the Variable of after append activation
"""
act = act
if act is None:
return input_var
if isinstance(act, str):
act = {'type': act}
else:
raise TypeError(
"{} should be a str in {}".format(act, self.name)
)
if (use_cudnn is not None) and use_cudnn:
act['use_cudnn'] = use_cudnn
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
if (use_mkldnn is not None) and use_mkldnn:
act['use_mkldnn'] = use_mkldnn
act_type = act.pop('type')
if in_dygraph_mode():
res = _append_activation_in_dygraph(
input_var, act_type, use_cudnn, use_mkldnn
)
return res
else:
tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op(
type=act_type,
inputs={"X": [input_var]},
outputs={"Out": [tmp]},
attrs=act,
)
return tmp
def is_instance(self, param, cls):
"""Check if the input parameter is instance of input class
Args:
param: parameter to be check
cls: class of the parameter
Return result of the check (True or False)
"""
param = param
if not isinstance(param, cls):
raise TypeError(
"The input {0} parameter of method {1} must be {2}, in layer {3}".format(
param, self.layer_type, cls.__name__, self.name
)
)
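LayerObjectHelper is the static-graph helper that the migrated Layer uses to create parameters, append ops, and attach activations; it moves here essentially unchanged. A minimal sketch of its activation path in static mode, assuming LayerObjectHelper as defined above is in scope (its post-move import path is not shown in this hunk, and "demo_layer" is an illustrative name):

import paddle

paddle.enable_static()

helper = LayerObjectHelper("demo_layer")

x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")

# iterate (input, param_attr) pairs the way Layer implementations do;
# a None param_attr is normalized to a default ParamAttr
for inp, attr in helper.iter_inputs_and_params(x):
    assert attr is not None

# append a relu op on top of an existing variable and return the new output
y = helper.append_activation(x, act="relu")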
......@@ -442,7 +442,7 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
return wrapper
from .dygraph.layers import Layer
from paddle.nn import Layer
if not isinstance(call_func, Layer):
if callable(call_func):
......
......@@ -20,11 +20,11 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake
from paddle.incubate import asp as sparsity
from paddle.incubate.asp.supported_layer_list import (
supported_layers_and_prune_func_map,
)
from paddle.nn.layer.layers import Layer, _convert_camel_to_snake
class MyOwnLayer(Layer):
......@@ -99,10 +99,8 @@ class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase):
sparsity.add_supported_layer(CustomerLayer, my_own_pruning)
self.layer = CustomerLayer()
self.customer_prefix = (
paddle.fluid.dygraph.layers._convert_camel_to_snake(
CustomerLayer.__name__
)
self.customer_prefix = paddle.nn.layer.layers._convert_camel_to_snake(
CustomerLayer.__name__
)
self.supported_layer_count_ref = 3
......
......@@ -22,7 +22,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
......@@ -39,7 +38,7 @@ epoch = 2
linear_size = 1000
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -18,7 +18,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.sharding import (
group_sharded_parallel,
......@@ -35,7 +34,7 @@ l2_decay = 1e-4
batch_size = 100
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -17,7 +17,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.sharding import (
group_sharded_parallel,
save_group_sharded_model,
......@@ -33,7 +32,7 @@ l2_decay = 1e-4
batch_size = 100
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -38,7 +37,7 @@ np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -38,7 +37,7 @@ np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -18,7 +18,6 @@
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
GroupShardedStage3,
)
......@@ -36,7 +35,7 @@ momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -23,7 +23,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.auto_parallel import engine
......@@ -86,7 +85,7 @@ class MLP_pipe(PipelineLayer):
)
class MLP_Hybrid(fluid.Layer):
class MLP_Hybrid(paddle.nn.Layer):
def __init__(
self,
embedding_size=1000,
......@@ -121,7 +120,7 @@ class MLP_Hybrid(fluid.Layer):
return y
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(
self,
embedding_size=1000,
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
def set_random_seed(seed):
......@@ -31,7 +30,7 @@ def set_random_seed(seed):
fleet.meta_parallel.model_parallel_random_seed(seed)
class ColumnLinearNet(fluid.dygraph.Layer):
class ColumnLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size, global_dtype):
super().__init__()
self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
......@@ -48,7 +47,7 @@ class ColumnLinearNet(fluid.dygraph.Layer):
return output
class RowLinearNet(fluid.dygraph.Layer):
class RowLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
......@@ -64,7 +63,7 @@ class RowLinearNet(fluid.dygraph.Layer):
return output
class EmbeddingNet(fluid.dygraph.Layer):
class EmbeddingNet(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size):
super().__init__()
self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
......@@ -76,7 +75,7 @@ class EmbeddingNet(fluid.dygraph.Layer):
return output
class SimpleMatmul(fluid.dygraph.Layer):
class SimpleMatmul(paddle.nn.Layer):
def __init__(self, weight, output_size, global_dtype):
super().__init__()
self.weight = paddle.create_parameter(
......@@ -99,7 +98,7 @@ class SimpleMatmul(fluid.dygraph.Layer):
return output
class SimpleEmbedding(fluid.dygraph.Layer):
class SimpleEmbedding(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size, weight):
super().__init__()
self.embedding = paddle.nn.Embedding(
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
def set_random_seed(seed, dp_id, rank_id):
......@@ -62,7 +61,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
return logits
class SimpleMPNet(fluid.dygraph.Layer):
class SimpleMPNet(paddle.nn.Layer):
def __init__(
self,
vocab_size,
......@@ -128,7 +127,7 @@ class SimpleMPNet(fluid.dygraph.Layer):
return x
class SimpleDPNet(fluid.dygraph.Layer):
class SimpleDPNet(paddle.nn.Layer):
def __init__(
self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
):
......
......@@ -22,8 +22,7 @@ import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.nn as nn
from paddle.distributed.fleet.meta_parallel import PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Sequential
from paddle.nn import Layer, Sequential
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ from paddle.distributed.fleet.meta_parallel import (
PipelineLayer,
PipelineParallelWithInterleave,
)
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
class ReshapeHelp(Layer):
......
......@@ -24,7 +24,7 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddle import framework
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import (
DygraphShardingOptimizer,
)
......@@ -58,7 +57,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
return logits
class SimpleMPNet(fluid.dygraph.Layer):
class SimpleMPNet(paddle.nn.Layer):
def __init__(
self,
vocab_size,
......@@ -124,7 +123,7 @@ class SimpleMPNet(fluid.dygraph.Layer):
return x
class SimpleDPNet(fluid.dygraph.Layer):
class SimpleDPNet(paddle.nn.Layer):
def __init__(
self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
):
......
......@@ -26,7 +26,7 @@ from paddle.distributed.fleet.meta_parallel import (
PipelineLayer,
SharedLayerDesc,
)
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def print_hook_fn(grad):
......
......@@ -16,14 +16,13 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
paddle.seed(123)
np.random.seed(2021)
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, hidden_size, vocab_size, is_sparse=False):
super().__init__()
self.hidden_size = hidden_size
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
......@@ -27,7 +26,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = paddle.nn.Sequential(
......
......@@ -34,7 +34,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -17,7 +17,6 @@ from parallel_dygraph_no_sync import TestNoSync
from test_dist_base import runtime_main
import paddle
import paddle.fluid as fluid
from paddle.nn import Linear
seed = 90
......@@ -26,7 +25,7 @@ batch_size = 4
batch_num = 1000
class SimpleNetControlFlow(fluid.Layer):
class SimpleNetControlFlow(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -17,7 +17,6 @@ from parallel_dygraph_no_sync import TestNoSync
from test_dist_base import runtime_main
import paddle
import paddle.fluid as fluid
from paddle.nn import Linear
seed = 90
......@@ -26,7 +25,7 @@ batch_size = 4
batch_num = 1000
class SimpleNetUnusedParam(fluid.Layer):
class SimpleNetUnusedParam(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -76,7 +76,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -109,7 +109,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -143,7 +143,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -207,7 +207,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -21,7 +21,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Conv2D, SyncBatchNorm
class TestLayer(fluid.dygraph.Layer):
class TestLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -18,7 +18,8 @@ from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.nn import Layer
from paddle.optimizer.lr import NoamDecay
"""
......
......@@ -17,12 +17,11 @@ import unittest
import paddle
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
paddle.enable_static()
class ColumnLinearNet(fluid.dygraph.Layer):
class ColumnLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
......@@ -39,7 +38,7 @@ class ColumnLinearNet(fluid.dygraph.Layer):
return output
class RowLinearNet(fluid.dygraph.Layer):
class RowLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
......@@ -55,7 +54,7 @@ class RowLinearNet(fluid.dygraph.Layer):
return output
class EmbeddingNet(fluid.dygraph.Layer):
class EmbeddingNet(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size):
super().__init__()
self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
......
......@@ -29,7 +29,7 @@ if fluid.core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
class SimpleConv(fluid.dygraph.Layer):
class SimpleConv(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -44,7 +43,7 @@ momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -16,9 +16,8 @@ from transformer_dygraph_model import MultiHeadAttention, PrePostProcessLayer
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer
from paddle.jit.api import to_static
from paddle.nn import Linear
from paddle.nn import Layer, Linear
class PositionwiseFeedForwardLayer(Layer):
......
......@@ -13,13 +13,12 @@
# limitations under the License.
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from paddle.nn import BatchNorm
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
ch_in,
......@@ -68,7 +67,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return out
class DownSample(fluid.dygraph.Layer):
class DownSample(paddle.nn.Layer):
def __init__(
self, ch_in, ch_out, filter_size=3, stride=2, padding=1, is_test=True
):
......@@ -90,7 +89,7 @@ class DownSample(fluid.dygraph.Layer):
return out
class BasicBlock(fluid.dygraph.Layer):
class BasicBlock(paddle.nn.Layer):
def __init__(self, ch_in, ch_out, is_test=True):
super().__init__()
......@@ -118,7 +117,7 @@ class BasicBlock(fluid.dygraph.Layer):
return out
class LayerWarp(fluid.dygraph.Layer):
class LayerWarp(paddle.nn.Layer):
def __init__(self, ch_in, ch_out, count, is_test=True):
super().__init__()
......@@ -142,7 +141,7 @@ class LayerWarp(fluid.dygraph.Layer):
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
class DarkNet53_conv_body(fluid.dygraph.Layer):
class DarkNet53_conv_body(paddle.nn.Layer):
def __init__(self, ch_in=3, is_test=True):
super().__init__()
self.stages = DarkNet_cfg[53]
......
......@@ -232,7 +232,7 @@ def nested_if_else_3(x):
return res
class NetWithControlFlowIf(fluid.dygraph.Layer):
class NetWithControlFlowIf(paddle.nn.Layer):
def __init__(self, hidden_dim=16):
super().__init__()
self.hidden_dim = hidden_dim
......
......@@ -19,10 +19,9 @@ from seq2seq_utils import Seq2SeqModelHyperParams as args
import paddle
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph.base import to_variable
from paddle.jit.api import to_static
from paddle.nn import Embedding
from paddle.nn import Embedding, Layer
INF = 1.0 * 1e5
alpha = 0.6
......@@ -84,7 +83,7 @@ class BasicLSTMUnit(Layer):
return new_hidden, new_cell
class BaseModel(fluid.dygraph.Layer):
class BaseModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......@@ -511,7 +510,7 @@ class BaseModel(fluid.dygraph.Layer):
return predicted_ids
class AttentionModel(fluid.dygraph.Layer):
class AttentionModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -17,8 +17,8 @@ from functools import reduce
import paddle
import paddle.fluid.param_attr as attr
from paddle.common_ops_import import Variable
from paddle.fluid.dygraph import Layer
from paddle.jit.api import to_static
from paddle.nn import Layer
class EmbeddingLayer:
......
......@@ -99,7 +99,7 @@ def _get_interp1d_bin_mask(
return p_mask
class Conv1D(fluid.dygraph.Layer):
class Conv1D(paddle.nn.Layer):
def __init__(
self,
prefix,
......@@ -140,7 +140,7 @@ class Conv1D(fluid.dygraph.Layer):
return x
class BMN(fluid.dygraph.Layer):
class BMN(paddle.nn.Layer):
def __init__(self, cfg):
super().__init__()
......
......@@ -117,7 +117,7 @@ class TestRecursiveCall1(unittest.TestCase):
lambda_fun = lambda x: x
class MyConvLayer(fluid.dygraph.Layer):
class MyConvLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._conv = paddle.nn.Conv2D(
......@@ -145,7 +145,7 @@ class MyConvLayer(fluid.dygraph.Layer):
return x_v
class MyLayer(fluid.dygraph.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -61,7 +61,7 @@ IMAGE_SIZE = 64
SEED = 2020
class Cycle_Gan(fluid.dygraph.Layer):
class Cycle_Gan(paddle.nn.Layer):
def __init__(self, input_channel, istrain=True):
super().__init__()
......@@ -151,7 +151,7 @@ class Cycle_Gan(fluid.dygraph.Layer):
return rec_A, fake_pool_rec_A
class build_resnet_block(fluid.dygraph.Layer):
class build_resnet_block(paddle.nn.Layer):
def __init__(self, dim, use_bias=False):
super().__init__()
......@@ -185,7 +185,7 @@ class build_resnet_block(fluid.dygraph.Layer):
return out_res + inputs
class build_generator_resnet_9blocks(fluid.dygraph.Layer):
class build_generator_resnet_9blocks(paddle.nn.Layer):
def __init__(self, input_channel):
super().__init__()
......@@ -267,7 +267,7 @@ class build_generator_resnet_9blocks(fluid.dygraph.Layer):
return y
class build_gen_discriminator(fluid.dygraph.Layer):
class build_gen_discriminator(paddle.nn.Layer):
def __init__(self, input_channel):
super().__init__()
......@@ -330,7 +330,7 @@ class build_gen_discriminator(fluid.dygraph.Layer):
return y
class conv2d(fluid.dygraph.Layer):
class conv2d(paddle.nn.Layer):
"""docstring for Conv2D"""
def __init__(
......@@ -398,7 +398,7 @@ class conv2d(fluid.dygraph.Layer):
return conv
class DeConv2D(fluid.dygraph.Layer):
class DeConv2D(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -21,12 +21,13 @@ from test_basic_api_transformation import dyfunc_to_variable
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.jit.dy2static.program_translator import (
ConcreteProgram,
StaticFunction,
)
from paddle.nn import Layer
from paddle.static import InputSpec
......
......@@ -25,7 +25,7 @@ PLACE = (
)
class SubNetWithDict(fluid.dygraph.Layer):
class SubNetWithDict(paddle.nn.Layer):
def __init__(self, hidden_size=16, output_size=16):
super().__init__()
......@@ -72,7 +72,7 @@ class SubNetWithDict(fluid.dygraph.Layer):
return out
class MainNetWithDict(fluid.dygraph.Layer):
class MainNetWithDict(paddle.nn.Layer):
def __init__(self, batch_size=64, hidden_size=16, output_size=16):
super().__init__()
self.batch_size = batch_size
......
......@@ -67,7 +67,7 @@ def func_decorated_by_other_2():
return 1
class LayerErrorInCompiletime(fluid.dygraph.Layer):
class LayerErrorInCompiletime(paddle.nn.Layer):
def __init__(self, fc_size=20):
super().__init__()
self._linear = paddle.nn.Linear(fc_size, fc_size)
......@@ -82,7 +82,7 @@ class LayerErrorInCompiletime(fluid.dygraph.Layer):
return out
class LayerErrorInCompiletime2(fluid.dygraph.Layer):
class LayerErrorInCompiletime2(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -23,7 +23,7 @@ from paddle.jit.api import to_static
SEED = 2020
class Pool2D(fluid.dygraph.Layer):
class Pool2D(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.pool2d = paddle.nn.AvgPool2D(kernel_size=2, stride=1)
......@@ -38,7 +38,7 @@ class Pool2D(fluid.dygraph.Layer):
return pre
class Linear(fluid.dygraph.Layer):
class Linear(paddle.nn.Layer):
def __init__(self, input_dim=10, output_dim=5):
super().__init__()
self.fc = paddle.nn.Linear(
......
......@@ -289,7 +289,7 @@ class TestAst2FuncWithExternalFunc(TestDygraphIfElse):
self.dyfunc = call_external_func
class NetWithExternalFunc(fluid.dygraph.Layer):
class NetWithExternalFunc(paddle.nn.Layer):
@paddle.jit.to_static
def forward(self, x, label=None):
if paddle.mean(x) < 0:
......
......@@ -40,7 +40,7 @@ input_specs = [
]
class DynamicGRU(fluid.dygraph.Layer):
class DynamicGRU(paddle.nn.Layer):
def __init__(
self,
size,
......@@ -90,7 +90,7 @@ class DynamicGRU(fluid.dygraph.Layer):
return res
class BiGRU(fluid.dygraph.Layer):
class BiGRU(paddle.nn.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super().__init__()
......@@ -158,7 +158,7 @@ class BiGRU(fluid.dygraph.Layer):
return bi_merge
class LinearChainCRF(fluid.dygraph.Layer):
class LinearChainCRF(paddle.nn.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super().__init__()
......@@ -222,7 +222,7 @@ class LinearChainCRF(fluid.dygraph.Layer):
return log_likelihood
class CRFDecoding(fluid.dygraph.Layer):
class CRFDecoding(paddle.nn.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super().__init__()
......@@ -271,7 +271,7 @@ class CRFDecoding(fluid.dygraph.Layer):
return viterbi_path
class ChunkEval(fluid.dygraph.Layer):
class ChunkEval(paddle.nn.Layer):
def __init__(
self, num_chunk_types, chunk_scheme, excluded_chunk_types=None
):
......@@ -344,7 +344,7 @@ class ChunkEval(fluid.dygraph.Layer):
)
class LexNet(fluid.dygraph.Layer):
class LexNet(paddle.nn.Layer):
def __init__(self, args, length=None):
super().__init__()
"""
......
......@@ -34,7 +34,7 @@ if paddle.fluid.is_compiled_with_cuda():
paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -80,7 +80,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -36,7 +36,7 @@ if fluid.is_compiled_with_cuda():
SEED = 2020
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -83,7 +83,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class DepthwiseSeparable(fluid.dygraph.Layer):
class DepthwiseSeparable(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -120,7 +120,7 @@ class DepthwiseSeparable(fluid.dygraph.Layer):
return y
class MobileNetV1(fluid.dygraph.Layer):
class MobileNetV1(paddle.nn.Layer):
def __init__(self, scale=1.0, class_dim=1000):
super().__init__()
self.scale = scale
......@@ -276,7 +276,7 @@ class MobileNetV1(fluid.dygraph.Layer):
return y
class InvertedResidualUnit(fluid.dygraph.Layer):
class InvertedResidualUnit(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -329,7 +329,7 @@ class InvertedResidualUnit(fluid.dygraph.Layer):
return y
class InvresiBlocks(fluid.dygraph.Layer):
class InvresiBlocks(paddle.nn.Layer):
def __init__(self, in_c, t, c, n, s):
super().__init__()
......@@ -366,7 +366,7 @@ class InvresiBlocks(fluid.dygraph.Layer):
return y
class MobileNetV2(fluid.dygraph.Layer):
class MobileNetV2(paddle.nn.Layer):
def __init__(self, class_dim=1000, scale=1.0):
super().__init__()
self.scale = scale
......
......@@ -173,7 +173,7 @@ class TestWithNoGrad(unittest.TestCase):
)
class GPT2LMHeadModel(fluid.dygraph.Layer):
class GPT2LMHeadModel(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.embedding0 = paddle.nn.Embedding(20, 16)
......
......@@ -203,7 +203,7 @@ class StaticCode2:
return __return_value_1
class NetWithError(fluid.dygraph.layers.Layer):
class NetWithError(paddle.nn.Layer):
@to_static
def forward(self, x):
linear = paddle.nn.Linear(32, 64)
......@@ -240,7 +240,7 @@ class TestEnableDeclarative(unittest.TestCase):
)
class Net(fluid.dygraph.layers.Layer):
class Net(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -28,7 +28,7 @@ PRINT_STEP = 20
SEED = 2020
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -128,7 +128,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -22,8 +22,9 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.nn import Layer
SEED = 2020
......
......@@ -55,7 +55,7 @@ def optimizer_setting(parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -86,7 +86,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super().__init__()
......@@ -140,7 +140,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return layer_helper.append_activation(y)
class ResNet(fluid.dygraph.Layer):
class ResNet(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -33,7 +33,7 @@ place = (
)
class SimpleFcLayer(fluid.dygraph.Layer):
class SimpleFcLayer(paddle.nn.Layer):
def __init__(self, fc_size):
super().__init__()
self._linear = paddle.nn.Linear(fc_size, fc_size)
......
......@@ -89,7 +89,7 @@ def optimizer_setting(params, parameter_list):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -120,7 +120,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -154,7 +154,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -218,7 +218,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -32,7 +32,7 @@ if fluid.is_compiled_with_cuda():
fluid.set_flags({'FLAGS_cudnn_deterministic': True})
class SimpleConvPool(fluid.dygraph.Layer):
class SimpleConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -57,7 +57,7 @@ class SimpleConvPool(fluid.dygraph.Layer):
return x
class CNN(fluid.dygraph.Layer):
class CNN(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -112,7 +112,7 @@ class CNN(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class BOW(fluid.dygraph.Layer):
class BOW(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -157,7 +157,7 @@ class BOW(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class GRU(fluid.dygraph.Layer):
class GRU(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -205,7 +205,7 @@ class GRU(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class BiGRU(fluid.dygraph.Layer):
class BiGRU(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......
......@@ -49,7 +49,7 @@ def parse_args():
return args
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -86,7 +86,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self, num_channels, num_filters, stride, shortcut=True, seg_num=8
):
......@@ -138,7 +138,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class TSM_ResNet(fluid.dygraph.Layer):
class TSM_ResNet(paddle.nn.Layer):
def __init__(self, name_scope, config, mode):
super().__init__(name_scope)
......
......@@ -219,7 +219,7 @@ def build_batch(dataset, batch_size, epoch_num):
)
class SkipGram(fluid.dygraph.Layer):
class SkipGram(paddle.nn.Layer):
def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
super().__init__(name_scope)
self.vocab_size = vocab_size
......
......@@ -17,9 +17,9 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import dygraph_to_static_func
from paddle.nn import Linear
from paddle.nn import Layer, Linear
def position_encoding_init(n_position, d_pos_vec):
......
......@@ -130,7 +130,7 @@ cfg.use_gpu = fluid.is_compiled_with_cuda()
cfg.class_num = 80
class YoloDetectionBlock(fluid.dygraph.Layer):
class YoloDetectionBlock(paddle.nn.Layer):
def __init__(self, ch_in, channel, is_test=True):
super().__init__()
......@@ -197,7 +197,7 @@ class YoloDetectionBlock(fluid.dygraph.Layer):
return route, tip
class Upsample(fluid.dygraph.Layer):
class Upsample(paddle.nn.Layer):
def __init__(self, scale=2):
super().__init__()
self.scale = scale
......@@ -219,7 +219,7 @@ class Upsample(fluid.dygraph.Layer):
return out
class YOLOv3(fluid.dygraph.Layer):
class YOLOv3(paddle.nn.Layer):
def __init__(self, ch_in, is_train=True, use_random=False):
super().__init__()
......
......@@ -20,8 +20,7 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Sequential
from paddle.nn import Layer, Sequential
class ReshapeHelp(Layer):
......
......@@ -31,7 +31,7 @@ sys.path.append("..")
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
class TestLayer(fluid.dygraph.Layer):
class TestLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -16,11 +16,10 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -66,7 +65,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
np.random.seed(2021)
paddle.seed(1024)
......@@ -25,7 +24,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = paddle.nn.Sequential(
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
......@@ -24,7 +23,7 @@ np.random.seed(2021)
paddle.seed(1024)
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
# bias is unused parameters, and it share with net_a
super().__init__()
......
......@@ -21,7 +21,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Embedding
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -22,7 +22,7 @@ from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import EagerParamBase, ParamBase, in_dygraph_mode
class L1(fluid.Layer):
class L1(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._param_attr = fluid.ParamAttr(
......@@ -39,7 +39,7 @@ class L1(fluid.Layer):
return self.w1 + self.w2
class L2(fluid.Layer):
class L2(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.layer1 = L1()
......@@ -49,7 +49,7 @@ class L2(fluid.Layer):
return self.layer1() + self.layer2()
class L3(fluid.Layer):
class L3(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.layer1 = L2()
......@@ -97,7 +97,7 @@ class TestBaseLayer(unittest.TestCase):
def test_add_parameter_with_error(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
param = net.create_parameter(shape=[1])
with self.assertRaises(TypeError):
......@@ -121,7 +121,7 @@ class TestBaseLayer(unittest.TestCase):
net.add_parameter("load_param", load_param)
class BufferLayer(fluid.Layer):
class BufferLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
buffer_var = to_variable(np.zeros([2, 4]).astype('int32'))
......@@ -131,7 +131,7 @@ class BufferLayer(fluid.Layer):
pass
class BufferNet(fluid.Layer):
class BufferNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.buffer_layer = BufferLayer()
......@@ -173,7 +173,7 @@ class TestBuffer(unittest.TestCase):
def test_register_buffer_with_error(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var = to_variable(np.zeros([1]))
with self.assertRaisesRegex(
......@@ -217,7 +217,7 @@ class TestBuffer(unittest.TestCase):
def test_register_buffer_same_name(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
var2 = to_variable(np.zeros([2]))
var3 = to_variable(np.zeros([3]))
......@@ -231,7 +231,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -240,7 +240,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_del(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
del net.buffer_name
......@@ -248,7 +248,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_overwrite(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
var2 = to_variable(np.zeros([2]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -264,7 +264,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_assign(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -288,14 +288,14 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_load(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
net.load_dict({})
def test_buffer_state_dict(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([2, 3]))
var2 = to_variable(np.zeros([3, 2]))
net.register_buffer("buffer_var1", var1)
......@@ -307,7 +307,7 @@ class TestBuffer(unittest.TestCase):
)
# load state_dict
net_load = fluid.Layer()
net_load = paddle.nn.Layer()
var = to_variable(np.ones([2, 3]))
net_load.register_buffer("buffer_var1", var)
net_load.load_dict(net.state_dict())
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
from paddle.nn import Linear
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -68,7 +68,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self, dtype="float32"):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.nn import Linear
SEED = 123123111
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -73,7 +73,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -22,7 +22,7 @@ from paddle.nn import Embedding
from paddle.tensor import random
class AutoPruneLayer0(fluid.Layer):
class AutoPruneLayer0(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear1 = paddle.nn.Linear(
......@@ -50,7 +50,7 @@ class AutoPruneLayer0(fluid.Layer):
return d
class AutoPruneLayer1(fluid.Layer):
class AutoPruneLayer1(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear1 = paddle.nn.Linear(
......@@ -79,7 +79,7 @@ class AutoPruneLayer1(fluid.Layer):
return d
class AutoPruneLayer2(fluid.Layer):
class AutoPruneLayer2(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear = paddle.nn.Linear(input_size, 10)
......@@ -98,7 +98,7 @@ class AutoPruneLayer2(fluid.Layer):
return loss
class AutoPruneLayer3(fluid.Layer):
class AutoPruneLayer3(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear = paddle.nn.Linear(input_size, 20)
......@@ -117,7 +117,7 @@ class AutoPruneLayer3(fluid.Layer):
return loss, part1, part2
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, input_size, vocab_size, size, dtype="float32"):
super().__init__(dtype=dtype)
self.embed0 = Embedding(vocab_size, size)
......@@ -139,7 +139,7 @@ class MyLayer(fluid.Layer):
return loss
class MyLayer2(fluid.Layer):
class MyLayer2(paddle.nn.Layer):
def __init__(self, input_size, vocab_size, size, dtype="float32"):
super().__init__(dtype=dtype)
self.embed0 = Embedding(vocab_size, size)
......
......@@ -20,7 +20,7 @@ import paddle
import paddle.fluid as fluid
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, layerlist):
super().__init__()
self.layerlist = layerlist
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
from paddle import _legacy_C_ops
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, num_stacked_param, use_fluid_api):
super().__init__()
# create ParameterList with iterable Parameters
......
......@@ -22,7 +22,7 @@ import paddle.fluid.core as core
from paddle.nn import Linear
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
class DMF(fluid.Layer):
class DMF(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._user_latent = Linear(1000, 256)
......@@ -78,7 +78,7 @@ class DMF(fluid.Layer):
return paddle.multiply(users, items)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._user_latent = Linear(1000, 256)
......@@ -111,7 +111,7 @@ class MLP(fluid.Layer):
return match_vec
class DeepCF(fluid.Layer):
class DeepCF(paddle.nn.Layer):
def __init__(self, num_users, num_items, matrix):
super().__init__()
self._num_users = num_users
......
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self._linear1 = paddle.nn.Linear(
......
......@@ -25,7 +25,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Linear
class Discriminator(fluid.Layer):
class Discriminator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._fc1 = Linear(1, 32)
......@@ -38,7 +38,7 @@ class Discriminator(fluid.Layer):
return x
class Generator(fluid.Layer):
class Generator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._fc1 = Linear(2, 64)
......
......@@ -30,7 +30,7 @@ def gen_data():
pass
class GraphConv(fluid.Layer):
class GraphConv(paddle.nn.Layer):
def __init__(self, name_scope, in_features, out_features):
super().__init__(name_scope)
......@@ -51,7 +51,7 @@ class GraphConv(fluid.Layer):
return paddle.matmul(adj, support) + self.bias
class GCN(fluid.Layer):
class GCN(paddle.nn.Layer):
def __init__(self, name_scope, num_hidden):
super().__init__(name_scope)
self.gc = GraphConv(self.full_name(), num_hidden, 32)
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
import paddle.nn as nn
class LeNetDygraph(fluid.dygraph.Layer):
class LeNetDygraph(paddle.nn.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super().__init__()
self.num_classes = num_classes
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
import paddle.nn as nn
class LeNetDygraph(fluid.dygraph.Layer):
class LeNetDygraph(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.features = nn.Sequential(
......
......@@ -170,7 +170,7 @@ class TestDygraphLoadStatic(unittest.TestCase):
with fluid.dygraph.guard():
class MyTest(fluid.dygraph.Layer):
class MyTest(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -25,7 +25,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.optimizer import SGDOptimizer
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -25,7 +25,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Linear
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -70,7 +70,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -20,7 +20,7 @@ import paddle
import paddle.fluid as fluid
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, num_channel, dim, num_filter=5):
super().__init__()
self.fc = paddle.nn.Linear(dim, dim)
......@@ -84,7 +84,7 @@ class TestImperativeNamedParameters(unittest.TestCase):
def test_dir_layer(self):
with fluid.dygraph.guard():
class Mymodel(fluid.dygraph.Layer):
class Mymodel(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.linear1 = paddle.nn.Linear(10, 10)
......
......@@ -59,7 +59,7 @@ class Config:
DATA_SHAPE = [1, 16, 64]
class ConvBNPool(fluid.dygraph.Layer):
class ConvBNPool(paddle.nn.Layer):
def __init__(
self,
group,
......@@ -122,7 +122,7 @@ class ConvBNPool(fluid.dygraph.Layer):
return bn_1
class OCRConv(fluid.dygraph.Layer):
class OCRConv(paddle.nn.Layer):
def __init__(self, is_test=False, use_cudnn=True):
super().__init__()
self.conv_bn_pool_1 = ConvBNPool(
......@@ -152,7 +152,7 @@ class OCRConv(fluid.dygraph.Layer):
return inputs_4
class DynamicGRU(fluid.dygraph.Layer):
class DynamicGRU(paddle.nn.Layer):
def __init__(
self,
size,
......@@ -193,7 +193,7 @@ class DynamicGRU(fluid.dygraph.Layer):
return res
class EncoderNet(fluid.dygraph.Layer):
class EncoderNet(paddle.nn.Layer):
def __init__(
self, rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True
):
......@@ -277,7 +277,7 @@ class EncoderNet(fluid.dygraph.Layer):
return gru_backward, encoded_vector, encoded_proj
class SimpleAttention(fluid.dygraph.Layer):
class SimpleAttention(paddle.nn.Layer):
def __init__(self, decoder_size):
super().__init__()
......@@ -312,7 +312,7 @@ class SimpleAttention(fluid.dygraph.Layer):
return context
class GRUDecoderWithAttention(fluid.dygraph.Layer):
class GRUDecoderWithAttention(paddle.nn.Layer):
def __init__(self, decoder_size, num_classes):
super().__init__()
self.simple_attention = SimpleAttention(decoder_size)
......@@ -359,7 +359,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
return res1
class OCRAttention(fluid.dygraph.Layer):
class OCRAttention(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.encoder_net = EncoderNet()
......
......@@ -46,7 +46,7 @@ from paddle.fluid.optimizer import (
# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer.
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -43,7 +43,7 @@ from paddle.fluid.optimizer import (
# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer.
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -24,7 +24,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -27,7 +27,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Embedding
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -145,7 +145,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -23,7 +23,7 @@ import paddle.fluid.core as core
from paddle.fluid.dygraph.base import to_variable
class RecurrentTest(fluid.Layer):
class RecurrentTest(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -24,7 +24,7 @@ from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
class Policy(fluid.dygraph.Layer):
class Policy(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
......
......@@ -75,7 +75,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -107,7 +107,7 @@ class ConvBNLayer(fluid.Layer):
return y
class BottleneckBlock(fluid.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self, num_channels, num_filters, stride, shortcut=True, use_cudnn=False
):
......@@ -163,7 +163,7 @@ class BottleneckBlock(fluid.Layer):
return layer_helper.append_activation(y)
class ResNet(fluid.Layer):
class ResNet(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102, use_cudnn=True):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.nn import Embedding
from paddle.optimizer import Adam
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -142,7 +142,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -64,7 +64,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -95,7 +95,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -129,7 +129,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -192,7 +192,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -26,7 +26,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Embedding
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -106,7 +106,7 @@ def create_mnist_dataset(cfg):
return __impl__
class InstanceNorm(fluid.dygraph.Layer):
class InstanceNorm(paddle.nn.Layer):
def __init__(self, num_channels, epsilon=1e-5):
super().__init__()
self.epsilon = epsilon
......@@ -129,7 +129,7 @@ class InstanceNorm(fluid.dygraph.Layer):
)
class Conv2DLayer(fluid.dygraph.Layer):
class Conv2DLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -170,7 +170,7 @@ class Conv2DLayer(fluid.dygraph.Layer):
return conv
class Deconv2DLayer(fluid.dygraph.Layer):
class Deconv2DLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -212,7 +212,7 @@ class Deconv2DLayer(fluid.dygraph.Layer):
return deconv
class ResidualBlock(fluid.dygraph.Layer):
class ResidualBlock(paddle.nn.Layer):
def __init__(self, num_channels, num_filters):
super().__init__()
self._conv0 = Conv2DLayer(
......@@ -241,7 +241,7 @@ class ResidualBlock(fluid.dygraph.Layer):
return input + conv1
class Generator(fluid.dygraph.Layer):
class Generator(paddle.nn.Layer):
def __init__(self, cfg, num_channels=3):
super().__init__()
conv_base = Conv2DLayer(
......@@ -324,7 +324,7 @@ class Generator(fluid.dygraph.Layer):
return out
class Discriminator(fluid.dygraph.Layer):
class Discriminator(paddle.nn.Layer):
def __init__(self, cfg, num_channels=3):
super().__init__()
......
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
class SimpleFCLayer(fluid.dygraph.Layer):
class SimpleFCLayer(paddle.nn.Layer):
def __init__(self, feature_size, batch_size, fc_size):
super().__init__()
self._linear = paddle.nn.Linear(feature_size, fc_size)
......
......@@ -20,9 +20,9 @@ from test_imperative_base import new_program_scope
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid import Layer, core
from paddle.fluid import core
from paddle.fluid.dygraph import guard, to_variable
from paddle.nn import Linear
from paddle.nn import Layer, Linear
np.set_printoptions(suppress=True)
......
......@@ -20,7 +20,7 @@ import paddle
import paddle.fluid as fluid
class ConvBNLayer(fluid.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -51,7 +51,7 @@ def random_batch_reader(input_size, label_size):
return __reader__
class LinearNet(fluid.dygraph.Layer):
class LinearNet(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear = Linear(in_size, out_size)
......@@ -61,7 +61,7 @@ class LinearNet(fluid.dygraph.Layer):
return self._linear(x)
class LinearNetWithInputSpec(fluid.dygraph.Layer):
class LinearNetWithInputSpec(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear = Linear(in_size, out_size)
......@@ -71,7 +71,7 @@ class LinearNetWithInputSpec(fluid.dygraph.Layer):
return self._linear(x)
class LinearNetNotDeclarative(fluid.dygraph.Layer):
class LinearNetNotDeclarative(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear = Linear(in_size, out_size)
......@@ -136,7 +136,7 @@ class LinerNetWithUselessInput(paddle.nn.Layer):
return out
class LinearNetReturnLoss(fluid.dygraph.Layer):
class LinearNetReturnLoss(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear = Linear(in_size, out_size)
......@@ -149,7 +149,7 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
return z, loss
class LinearNetMultiInput(fluid.dygraph.Layer):
class LinearNetMultiInput(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear1 = Linear(in_size, out_size)
......@@ -168,7 +168,7 @@ class LinearNetMultiInput(fluid.dygraph.Layer):
return x_out, y_out, loss
class LinearNetMultiInput1(fluid.dygraph.Layer):
class LinearNetMultiInput1(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear1 = Linear(in_size, out_size)
......@@ -187,7 +187,7 @@ class LinearNetMultiInput1(fluid.dygraph.Layer):
return x_out, y_out, loss
class MultiLoadingLinearNet(fluid.dygraph.Layer):
class MultiLoadingLinearNet(paddle.nn.Layer):
def __init__(self, size, model_path):
super().__init__()
self._linear = Linear(size, size)
......@@ -203,7 +203,7 @@ class MultiLoadingLinearNet(fluid.dygraph.Layer):
return y
class LinearNetReturnHidden(fluid.dygraph.Layer):
class LinearNetReturnHidden(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear_1 = Linear(in_size, out_size)
......@@ -217,7 +217,7 @@ class LinearNetReturnHidden(fluid.dygraph.Layer):
return y, loss
class LinearNetWithNestOut(fluid.dygraph.Layer):
class LinearNetWithNestOut(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear_1 = Linear(in_size, out_size)
......@@ -278,7 +278,7 @@ class NoParamLayer(paddle.nn.Layer):
return x + y
class LinearNetWithMultiStaticFunc(fluid.dygraph.Layer):
class LinearNetWithMultiStaticFunc(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super().__init__()
self._linear_0 = Linear(in_size, out_size)
......
......@@ -80,7 +80,7 @@ class LayerTest(unittest.TestCase):
class TestLayer(LayerTest):
def test_custom_layer_with_kwargs(self):
class CustomLayer(fluid.Layer):
class CustomLayer(paddle.nn.Layer):
def __init__(self, input_size, linear1_size=4):
super().__init__()
self.linear1 = paddle.nn.Linear(
......
......@@ -34,7 +34,7 @@ from paddle.io import DataLoader
from paddle.nn import Linear
class SimpleFCNet(fluid.dygraph.Layer):
class SimpleFCNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -34,7 +34,7 @@ from paddle.io import DataLoader
from paddle.nn import Linear
class SimpleFCNet(fluid.dygraph.Layer):
class SimpleFCNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -22,7 +22,7 @@ import paddle.nn.functional as F
from paddle import _legacy_C_ops
class TestTracedLayer(fluid.dygraph.Layer):
class TestTracedLayer(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -140,7 +140,7 @@ def static(
return out_hidden, out_pred, loss
class DygraphLayer(fluid.dygraph.Layer):
class DygraphLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.fc_1 = paddle.nn.Linear(
......
......@@ -25,7 +25,7 @@ np.random.seed(SEED)
paddle.seed(SEED)
class Generator(fluid.dygraph.Layer):
class Generator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.conv1 = paddle.nn.Conv2D(3, 3, 3, padding=1)
......@@ -36,7 +36,7 @@ class Generator(fluid.dygraph.Layer):
return x
class Discriminator(fluid.dygraph.Layer):
class Discriminator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.convd = paddle.nn.Conv2D(6, 3, 1)
......
......@@ -31,7 +31,7 @@ from paddle.fluid.optimizer import Adam
paddle.enable_static()
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self,
name_scope,
......@@ -153,7 +153,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
name_scope,
......
......@@ -75,7 +75,7 @@ class TestTracedLayerErrMsg(unittest.TestCase):
None, [in_x]
)
self.assertEqual(
"The type of 'layer' in paddle.jit.TracedLayer.trace must be fluid.dygraph.Layer, but received <{} 'NoneType'>.".format(
"The type of 'layer' in paddle.jit.TracedLayer.trace must be paddle.nn.Layer, but received <{} 'NoneType'>.".format(
self.type_str
),
str(e.exception),
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super(SimpleNet, self).__init__()
self.w1 = self.create_parameter(
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super(SimpleNet, self).__init__()
self.w1 = self.create_parameter(
......
......@@ -30,7 +30,7 @@ from ..fluid.core import CustomPlace # noqa: F401
from ..fluid.core import VarBase # noqa: F401
from ..fluid import core # noqa: F401
from ..fluid.dygraph import base, layers, to_variable
from ..fluid.dygraph import base, to_variable
from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401
from ..fluid.dygraph.base import grad # noqa: F401
from .io import save # noqa: F401
......
......@@ -37,13 +37,6 @@ from paddle.fluid.framework import (
_non_static_mode,
_varbase_creator,
)
from paddle.jit.api import _SaveLoadConfig
from paddle.jit.translated_layer import (
INFER_MODEL_SUFFIX,
INFER_PARAMS_SUFFIX,
_construct_params_and_buffers,
_construct_program_holders,
)
from .io_utils import (
_is_file_path,
......@@ -81,6 +74,13 @@ def _build_saved_state_dict(state_dict):
def _load_state_dict_from_save_inference_model(model_path, config):
# 1. load program desc & construct _ProgramHolder
# TODO(GGBond8488):From a long-term perspective, it is inappropriate for the framework to
# rely on jit. It is necessary to migrate the dependency from jit to the framework in the future
from paddle.jit.translated_layer import (
_construct_params_and_buffers,
_construct_program_holders,
)
programs = _construct_program_holders(model_path, config.model_filename)
# 2. load layer parameters & buffers
......@@ -168,6 +168,13 @@ def _load_state_dict_from_save_params(model_path):
def _build_load_path_and_config(path, config):
# NOTE(chenweihang): If both [prefix save format] and [directory save format] exist,
# raise error, avoid confusing behavior
# TODO(GGBond8488):From a long-term perspective, it is inappropriate for the framework to
# rely on jit. It is necessary to migrate the dependency from jit to the framework in the future
from paddle.jit.translated_layer import (
INFER_MODEL_SUFFIX,
INFER_PARAMS_SUFFIX,
)
prefix_format_path = path + INFER_MODEL_SUFFIX
prefix_format_exist = os.path.exists(prefix_format_path)
directory_format_exist = os.path.isdir(path)
......@@ -233,6 +240,10 @@ def _parse_load_config(configs):
)
# construct inner config
# TODO(GGBond8488):From a long-term perspective, it is inappropriate for the framework to
# rely on jit. It is necessary to migrate the dependency from jit to the framework in the future
from paddle.jit.api import _SaveLoadConfig
inner_config = _SaveLoadConfig()
inner_config.model_filename = configs.get('model_filename', None)
inner_config.params_filename = configs.get('params_filename', None)
......@@ -254,6 +265,10 @@ def _parse_save_config(configs):
)
# construct inner config
# TODO(GGBond8488):From a long-term perspective, it is inappropriate for the framework to
# rely on jit. It is necessary to migrate the dependency from jit to the framework in the future
from paddle.jit.api import _SaveLoadConfig
inner_config = _SaveLoadConfig()
inner_config.use_binary_format = configs.get('use_binary_format', False)
inner_config.pickle_protocol = configs.get('pickle_protocol', None)
......@@ -298,7 +313,9 @@ def _pickle_save(obj, f, protocol):
return layer
_parse_every_object(
obj, lambda v: isinstance(v, fluid.Layer), create_layer_dispatch_table
obj,
lambda v: isinstance(v, paddle.nn.Layer),
create_layer_dispatch_table,
)
def add_dispatch_table():
......@@ -371,7 +388,7 @@ def _is_state_dict(obj):
return isinstance(
obj,
(
fluid.Layer,
paddle.nn.Layer,
Program,
core.VarBase,
core.eager.Tensor,
......@@ -493,7 +510,7 @@ def _parse_every_object(obj, condition_func, convert_func):
def _parse_load_result(obj, return_numpy):
def is_layer(obj):
return isinstance(obj, fluid.Layer)
return isinstance(obj, paddle.nn.Layer)
def parse_layer(obj):
temp_dict = _parse_load_result(obj.__dict__, False)
......
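The TODO hunks above all apply the same workaround: paddle.framework can no longer import paddle.jit at module load time without creating an import cycle, so the jit symbols (_SaveLoadConfig and the translated_layer helpers) are now imported inside the functions that use them. A minimal, self-contained sketch of that deferred-import pattern (with the standard-library json module standing in for the deferred paddle.jit import) looks like this:

def _parse_config(configs):
    # Deferred import: the dependency is resolved on first call instead of at
    # module import time, which is how the hunks above break the circular import.
    import json  # stands in here for "from paddle.jit.api import _SaveLoadConfig"
    return json.dumps(configs)

print(_parse_config({"model_filename": None}))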
......@@ -99,14 +99,12 @@ def add_supported_layer(layer, pruning_func=None):
name = None
if isinstance(layer, str):
name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
elif isinstance(layer, paddle.nn.Layer):
name = paddle.nn.layer.layers._convert_camel_to_snake(
type(layer).__name__
)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__
)
elif issubclass(layer, paddle.nn.Layer):
name = paddle.nn.layer.layers._convert_camel_to_snake(layer.__name__)
else:
assert (
"The type of layer should be string of Layer, but got {}!".format(
......
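add_supported_layer now resolves a layer's registry name through paddle.nn.layer.layers._convert_camel_to_snake, which turns a class name such as RowParallelLinear into row_parallel_linear (matching the keys in _scope_dist2single below). A standalone sketch of that conversion, reusing the _first_cap_re and _all_cap_re patterns defined in the new layers.py (the exact body of _convert_camel_to_snake may differ):

import re

_first_cap_re = re.compile('(.)([A-Z][a-z]+)')
_all_cap_re = re.compile('([a-z])([A-Z])')

def convert_camel_to_snake(name):
    # Insert an underscore before each capitalized word, then lower-case the result.
    s1 = _first_cap_re.sub(r'\1_\2', name)
    return _all_cap_re.sub(r'\1_\2', s1).lower()

print(convert_camel_to_snake('RowParallelLinear'))  # row_parallel_linear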
......@@ -55,7 +55,7 @@ from paddle.jit.translated_layer import (
INFER_PARAMS_INFO_SUFFIX,
INFER_PROPERTY_SUFFIX,
)
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
from paddle.fluid.executor import Executor, scope_guard
from paddle.fluid.framework import (
Block,
......@@ -1618,7 +1618,7 @@ class TracedLayer:
"""
assert isinstance(
layer, Layer
), "The type of 'layer' in paddle.jit.TracedLayer.trace must be fluid.dygraph.Layer, but received {}.".format(
), "The type of 'layer' in paddle.jit.TracedLayer.trace must be paddle.nn.Layer, but received {}.".format(
type(layer)
)
outs, prog, feed, fetch, parameters = _trace(layer, inputs)
......
......@@ -23,8 +23,6 @@ from typing import Any, List
import numpy
from paddle.fluid.dygraph.layers import Layer
from .convert_operators import (
convert_enumerate,
convert_len,
......@@ -304,6 +302,8 @@ def convert_call(func):
converted_call = None
elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'):
from paddle.nn import Layer
if hasattr(func, 'forward') and isinstance(func, Layer):
try:
_, forward_func = unwrap_decorators(func.forward)
......
......@@ -19,9 +19,9 @@ import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.dygraph import layers
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.jit.translated_layer import TranslatedLayer
from paddle.nn.layer import layers
from . import logging_utils
from .utils import (
......
......@@ -21,9 +21,9 @@ from paddle import _legacy_C_ops
from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard
from paddle.fluid import backward, core, framework, program_guard
from paddle.fluid.compiler import BuildStrategy
from paddle.fluid.dygraph import layers
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.framework import _apply_pass
from paddle.nn.layer import layers
from . import logging_utils
from .utils import RETURN_NO_VALUE_MAGIC_NUM, _out_grad_names, _param_grad_names
......
......@@ -22,8 +22,8 @@ import weakref
from paddle.amp.auto_cast import _in_amp_guard
from paddle.fluid import _non_static_mode, core, framework
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph import layers
from paddle.fluid.dygraph.base import param_guard, switch_to_static_graph
from paddle.nn.layer import layers
from paddle.utils import flatten, gast
from . import error, logging_utils
......
......@@ -393,7 +393,7 @@ def update_args_of_func(node, dygraph_node, method_name):
import paddle.fluid as fluid # noqa: F401
if method_name == "__init__" or eval(
"issubclass({}, fluid.dygraph.Layer)".format(class_src)
"issubclass({}, paddle.nn.Layer)".format(class_src)
):
full_args = eval(f"inspect.getfullargspec({class_src}.{method_name})")
full_args_name = [
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
from paddle import _legacy_C_ops
from paddle.fluid import backward, core, framework, unique_name
from paddle.fluid.dygraph import layers
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.executor import (
_is_dy2st_enable_standalone_executor,
......@@ -31,6 +30,7 @@ from paddle.jit.dy2static.partial_program import (
LazyInitialized,
add_build_strategy_for,
)
from paddle.nn.layer import layers
from .dy2static.utils import _out_grad_names, _param_grad_names
......
......@@ -14,7 +14,6 @@
# TODO: import all neural network related api under this directory,
# including layers, linear, conv, rnn etc.
from ..fluid.dygraph.layers import Layer # noqa: F401
from .layer.container import LayerList # noqa: F401
from .layer.container import ParameterList # noqa: F401
from .layer.container import Sequential # noqa: F401
......@@ -150,6 +149,8 @@ from .layer.vision import PixelUnshuffle # noqa: F401
from .layer.vision import ChannelShuffle # noqa: F401
from .layer.container import LayerDict # noqa: F401
from .layer.layers import Layer # noqa: F401
from .utils.spectral_norm_hook import spectral_norm
# TODO: remove loss, keep it for too many used in unitests
......
......@@ -98,5 +98,6 @@ from .vision import PixelUnshuffle # noqa: F401
from .vision import ChannelShuffle # noqa: F401
from .distance import PairwiseDistance # noqa: F401
from .container import LayerDict # noqa: F401
from .layers import Layer
__all__ = []
......@@ -15,10 +15,10 @@
# TODO: define activation functions of neural network
from paddle.framework import get_default_dtype
from paddle.nn import Layer
from .. import functional as F
from ..initializer import Constant
from .layers import Layer
__all__ = []
......
......@@ -15,9 +15,9 @@
# TODO: define the common classes to build a neural network
import paddle
from paddle import in_dynamic_mode
from paddle.nn import Layer
from .. import functional as F
from .layers import Layer
__all__ = []
......
......@@ -17,7 +17,7 @@ from collections.abc import Iterable, Mapping
from ...fluid.dygraph.base import param_guard
from ...fluid.framework import Parameter
from .. import Layer
from .layers import Layer
__all__ = []
......
......@@ -24,10 +24,10 @@ from ...device import (
is_compiled_with_rocm,
)
from ...utils import convert_to_list
from .. import Layer
from .. import functional as F
from ..functional.conv import _update_padding_nd
from ..initializer import Normal
from .layers import Layer
__all__ = []
......
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .. import Layer
from .. import functional as F
from .layers import Layer
__all__ = []
......
......@@ -13,48 +13,91 @@
# limitations under the License.
import collections
import numpy as np
import re
import copy
import weakref
import warnings
import inspect
import re
import warnings
import weakref
import numpy as np
import paddle
import paddle.profiler as profiler
from paddle.profiler.utils import in_profiler_mode
from .. import unique_name
from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper
from .layer_hooks import (
record_program_ops_pre_hook,
set_op_customized_attrs_post_hook,
LayerOpsRecoder,
)
from .base import (
program_desc_tracing_guard,
in_declarative_mode,
import paddle.utils.deprecated as deprecated
from paddle.fluid import core, framework, unique_name
from paddle.fluid.core import VarDesc
from paddle.fluid.dygraph import no_grad
from paddle.fluid.dygraph.base import (
_convert_into_variable,
in_declarative_mode,
program_desc_tracing_guard,
)
from paddle.fluid import framework
from paddle.fluid.dygraph_utils import _append_activation_in_dygraph
from paddle.fluid.executor import Executor, global_scope
from paddle.fluid.framework import Parameter, Program
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.framework import (
_global_flags,
convert_np_dtype_to_dtype_,
default_main_program,
in_dygraph_mode,
)
from paddle.fluid.framework import Program
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.core import VarDesc
from paddle.fluid.dygraph import no_grad
import paddle.utils.deprecated as deprecated
from paddle.fluid.layer_helper_base import LayerHelperBase
from paddle.fluid.param_attr import ParamAttr
from paddle.profiler.utils import in_profiler_mode
__all__ = ['Layer']
__all__ = []
_first_cap_re = re.compile('(.)([A-Z][a-z]+)')
_all_cap_re = re.compile('([a-z])([A-Z])')
def record_program_ops_pre_hook(layer, inputs):
"""
A pre-hook to mark the op count before entering layer.forward.
"""
if not in_dygraph_mode():
if layer._op_recorder.start < 0:
layer._op_recorder.start = len(
default_main_program().current_block().ops
)
layer._op_recorder.is_valid = True
else:
layer._op_recorder.is_valid = False
warnings.warn(
"{} has recorded the op information before. Please check whether you call this layer twice.".format(
layer._full_name
)
)
return None
def set_op_customized_attrs_post_hook(layer, inputs, outputs):
"""
A post-hook to append customized attributes to all operators generated in the current layer.
"""
if not in_dygraph_mode() and layer._op_recorder.is_valid:
start = layer._op_recorder.start
end = len(default_main_program().current_block().ops)
assert start >= 0 and end >= start
ops = default_main_program().current_block().ops[start:end]
layer._op_recorder.end = end
layer._op_recorder.ops = ops
for op in ops:
for attr_name, val in layer._customized_attrs.items():
op._set_attr(attr_name, val)
# remove pre-hook and post-hook
for hook_helper in layer._op_recorder.hooks:
hook_helper.remove()
return None
def _scope_dist2single(dist_scope):
mapping = {
"row_parallel_linear": "linear",
......@@ -82,6 +125,202 @@ def _addindent(string, indent):
return s1[0] + '\n' + '\n'.join(s2)
class LayerObjectHelper(LayerHelperBase):
def __init__(self, name):
super().__init__(name, layer_type=name)
def append_op(
self,
type=None,
inputs=None,
outputs=None,
attrs=None,
stop_gradient=None,
):
"""append an operator for this layer object.
Args:
type: operator type
inputs: input variable of the operator
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
return self.main_program.current_block().append_op(
type=type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=stop_gradient,
)
def _multiple_input(self, inputs_in):
inputs = inputs_in
ret = []
if isinstance(inputs, (list, tuple)):
for inp in inputs:
ret.append(self.to_variable(inp))
else:
ret.append(self.to_variable(inputs))
return ret
# TODO: make it public when we need it
def _input(self, inputs_in):
inputs = self._multiple_input(inputs_in)
if len(inputs) != 1:
raise "{0} layer only takes one input in".format(self.layer_type)
return inputs[0]
def _multiple_param_attr(self, length, param_attr_in=None):
param_attr = param_attr_in
if isinstance(param_attr, ParamAttr):
param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length:
raise ValueError(
"parameter number mismatch in {}".format(self.name)
)
elif len(param_attr) == 1 and length != 1:
tmp = [None] * length
for i in range(length):
tmp[i] = copy.deepcopy(param_attr[0])
param_attr = tmp
return param_attr
def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
"""Access all inputs and params one by one
Args:
inputs_in: inputs to iterate over
param_attr_in: param_attr to iterate over
Returns input, param_attr
"""
param_attr_in = ParamAttr._to_attr(param_attr_in)
if isinstance(param_attr_in, bool):
raise ValueError(
'Param_attr should not be False in {}'.format(self.name)
)
inputs = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs)
param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
for ipt, param_attr in zip(inputs, param_attrs):
yield ipt, param_attr
def input_dtype(self, inputs_in):
"""Get input data type
Args:
inputs_in: inputs whose data type is queried
Returns dtype of the input
"""
inputs_in = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs_in)
dtype = None
for each in inputs:
if dtype is None:
dtype = each.dtype
elif dtype != each.dtype:
raise ValueError(
"Data Type mismatch: %d to %d in %s"
% (dtype, each.dtype, self.name)
)
return dtype
def get_parameter(self, name):
"""Get parameter specifically
Args:
name: parameter's name
Returns target parameter
"""
param = self.main_program.global_block().var(name)
if not isinstance(param, Parameter):
raise ValueError(
"no Parameter name %s found in %s" % (name, self.name)
)
return param
# TODO: this should not be called anymore after all activation func move to Layers
def append_activation(self, input_var, act=None, use_cudnn=None):
"""Append activation
Args:
input_var: the input variable. len(input_var.shape) must be
greater than or equal to 2.
act: activation type
use_cudnn: whether to use cudnn
Returns the Variable produced after appending the activation
"""
act = act
if act is None:
return input_var
if isinstance(act, str):
act = {'type': act}
else:
raise TypeError(
"{} should be unicode or str in {}".format(act, self.name)
)
if (use_cudnn is not None) and use_cudnn:
act['use_cudnn'] = use_cudnn
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
if (use_mkldnn is not None) and use_mkldnn:
act['use_mkldnn'] = use_mkldnn
act_type = act.pop('type')
if in_dygraph_mode():
res = _append_activation_in_dygraph(
input_var, act_type, use_cudnn, use_mkldnn
)
return res
else:
tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op(
type=act_type,
inputs={"X": [input_var]},
outputs={"Out": [tmp]},
attrs=act,
)
return tmp
def is_instance(self, param, cls):
"""Check if the input parameter is instance of input class
Args:
param: parameter to be check
cls: class of the parameter
Return result of the check (True or False)
"""
param = param
if not isinstance(param, cls):
raise TypeError(
"The input {0} parameter of method {1} must be {2}, in layer {3}",
param,
self.layer_type,
cls.__name__,
self.name,
)
class LayerOpsRecoder:
"""
Record generated operators information in nn.Layer.
"""
def __init__(self, start=-1, end=-1, ops=None, is_valid=False, hooks=None):
self.start = start
self.end = end
self.ops = ops
self.is_valid = is_valid
self.hooks = hooks
class HookRemoveHelper:
"""A HookRemoveHelper that can be used to remove hook."""
......
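record_program_ops_pre_hook and set_op_customized_attrs_post_hook, now defined in this file, are ordinary forward pre/post hooks: they are registered on a layer and the returned HookRemoveHelper objects (kept in layer._op_recorder.hooks) are later used to detach them. A minimal sketch of that hook mechanism on a stock paddle.nn.Layer, using a hypothetical call-counting hook, is:

import paddle

def count_calls_pre_hook(layer, inputs):
    # Runs before layer.forward, in the same way record_program_ops_pre_hook does.
    layer._call_count = getattr(layer, '_call_count', 0) + 1
    return None

linear = paddle.nn.Linear(4, 4)
helper = linear.register_forward_pre_hook(count_calls_pre_hook)  # returns a HookRemoveHelper
linear(paddle.randn([2, 4]))
helper.remove()  # detach the hook, as set_op_customized_attrs_post_hook does above
print(linear._call_count)  # 1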
......@@ -20,8 +20,8 @@ import paddle.fluid as fluid
from paddle import in_dynamic_mode
from paddle.fluid.framework import in_dygraph_mode
from .. import Layer
from .. import functional as F
from .layers import Layer
__all__ = []
......
......@@ -39,10 +39,10 @@ from paddle.fluid.framework import in_dygraph_mode
from ...fluid import dygraph_utils
from ...fluid.data_feeder import check_variable_and_dtype
from ...framework import ParamAttr, _global_flags, get_default_dtype, no_grad
from .. import Layer
from .. import functional as F
from ..functional import batch_norm, instance_norm, layer_norm
from ..initializer import Constant, Normal
from .layers import Layer
__all__ = []
......
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .. import Layer
from .. import functional as F
from .layers import Layer
__all__ = []
......
......@@ -30,12 +30,12 @@ from paddle.fluid.framework import (
)
from paddle.fluid.layers import control_flow
from paddle.framework import core
from paddle.nn import Layer
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.tensor.manipulation import tensor_array_to_tensor
from .container import LayerList
from .layers import Layer
__all__ = []
......
......@@ -25,10 +25,10 @@ from paddle.fluid.data_feeder import convert_dtype
from ... import tensor
from ...fluid import layers
from ...framework import ParamAttr
from .. import Layer
from .. import functional as F
from .common import Dropout, Linear
from .container import LayerList
from .layers import Layer
from .norm import LayerNorm
__all__ = []
......
......@@ -14,7 +14,8 @@
# TODO: define specitial functions used in computer vision task
from .. import Layer, functional
from .. import functional
from .layers import Layer
__all__ = []
......
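Taken together, the user-visible change in this diff is the base-class path: networks that used to subclass fluid.dygraph.Layer (or fluid.Layer) now subclass paddle.nn.Layer, with the rest of the dygraph workflow unchanged. A minimal sketch of the migrated pattern, assuming a current paddle installation:

import paddle

class SimpleNet(paddle.nn.Layer):  # previously: class SimpleNet(fluid.dygraph.Layer)
    def __init__(self, in_size=10, out_size=20):
        super().__init__()
        self._linear = paddle.nn.Linear(in_size, out_size)

    def forward(self, x):
        return self._linear(x)

net = SimpleNet()
out = net(paddle.randn([4, 10]))
print(out.shape)  # [4, 20]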