Unverified commit 1d5cad23, authored by GGBond8488, committed by GitHub

【fluid clean】Move out layers and layers helper (#49415)

* remove unused fluid beam_search_decoder

* move Layer and related helpers to paddle.nn.common

* modify Layer references from dygraph.layers.Layer to paddle.nn.common.layers

* stash change

* remove fluid layer_object_helper, layers.py

* remove fluid layers init

* add setup

* fix unittest

* delete layers in fluid.dygraph

* merge paddle.tensor.stat.py

* fix circular import

* fix circular import

* remove redundant in_dygraph_mode import

* remove paddle.nn.common.* in fluid.__init__

* recover nn.rnn

* paddle.framework lazily imports paddle.jit to avoid a circular import

* remove leftover dygraph.layers refs

* merge develop

* fix import error

* fix test error

* fix merge error

* fix test fluid.Layer

* fix test error

* fix test error

* fix import error

* fix import error

* fix comments

* fix circular import

* fix rnn import error

* fix circular import
Parent 5b6d2f85
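For user code, the visible effect of this change is that `Layer` now lives under `paddle.nn` instead of `paddle.fluid.dygraph`. A minimal before/after sketch of the migrated pattern follows; the `MLP` class and the sizes are illustrative, not taken from the diff:

import paddle

# Before this PR (now removed):
#     from paddle.fluid.dygraph import Layer
#     class MLP(fluid.Layer): ...

# After this PR, the canonical import path:
from paddle.nn import Layer

class MLP(Layer):
    def __init__(self, linear_size=1000):
        super().__init__()
        self._linear = paddle.nn.Linear(linear_size, linear_size)

    def forward(self, x):
        return self._linear(x)

net = MLP(linear_size=8)
out = net(paddle.ones([2, 8]))  # forward pass in dygraph mode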
......@@ -14,7 +14,6 @@
import paddle
from paddle.fluid import core
from paddle.nn import Layer
from paddle.nn import functional as F
from ...base import topology as tp
......@@ -32,7 +31,7 @@ def is_fused_matmul_bias_supported():
return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')
class VocabParallelEmbedding(Layer):
class VocabParallelEmbedding(paddle.nn.Layer):
"""Embedding mp parallelized in the vocabulary dimension.
this class is used for splitting embedding in mp group.
......@@ -170,7 +169,7 @@ class VocabParallelEmbedding(Layer):
return output
class ColumnParallelLinear(Layer):
class ColumnParallelLinear(paddle.nn.Layer):
"""Linear layer with mp parallelized(column).
this class is used for splitting Linear Layer in mp group, column split the weight of the Linear layer.
......@@ -329,7 +328,7 @@ class ColumnParallelLinear(Layer):
return output
class RowParallelLinear(Layer):
class RowParallelLinear(paddle.nn.Layer):
"""Linear layer with mp parallelized(row).
this class is used for splitting Linear Layer in mp group, row split the weight of the Linear layer.
......@@ -495,7 +494,7 @@ class RowParallelLinear(Layer):
return output
class ParallelCrossEntropy(Layer):
class ParallelCrossEntropy(paddle.nn.Layer):
"""CrossEntropy with mp parallelized.
this class is used for splitting softmax cross entropy in mp group.
......
......@@ -46,7 +46,8 @@ from paddle.distributed.fleet.launch_utils import check_backend
# (TODO: GhostScreaming) It will be removed later.
from paddle.framework import ParamBase, _set_expected_place
from paddle.framework import base as imperative_base
from paddle.framework import core, in_dygraph_mode, layers, to_variable
from paddle.framework import core, in_dygraph_mode, to_variable
from paddle.nn.layer import layers
from paddle.utils import deprecated
from . import parallel_helper
......
......@@ -86,7 +86,6 @@ from .parallel_executor import *
from . import compiler
from .compiler import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
from .dygraph.layers import *
from .dygraph.base import enable_dygraph, disable_dygraph
from .dygraph.varbase_patch_methods import monkey_patch_varbase
from .core import _cuda_synchronize
......
......@@ -135,7 +135,7 @@ class TestCorrelationOp(unittest.TestCase):
np.testing.assert_allclose(res[0], out_np, rtol=1e-05, atol=1e-8)
class Net(fluid.dygraph.Layer):
class Net(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -15,9 +15,6 @@
from . import base
from .base import *
from . import layers
from .layers import *
from . import tracer
from .tracer import *
......@@ -27,6 +24,5 @@ from .learning_rate_scheduler import *
from .math_op_patch import monkey_patch_math_varbase
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
__all__ += learning_rate_scheduler.__all__
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from paddle.fluid.framework import default_main_program, in_dygraph_mode
class LayerOpsRecoder:
"""
    Records information about the operators generated in an nn.Layer.
"""
def __init__(self, start=-1, end=-1, ops=None, is_valid=False, hooks=None):
self.start = start
self.end = end
self.ops = ops
self.is_valid = is_valid
self.hooks = hooks
def record_program_ops_pre_hook(layer, inputs):
"""
    A pre-hook to mark the number of ops already in the program before entering layer.forward.
"""
if not in_dygraph_mode():
if layer._op_recorder.start < 0:
layer._op_recorder.start = len(
default_main_program().current_block().ops
)
layer._op_recorder.is_valid = True
else:
layer._op_recorder.is_valid = False
warnings.warn(
"{} has recorded the op information before. Please check whether you call this layer twice.".format(
layer._full_name
)
)
return None
def set_op_customized_attrs_post_hook(layer, inputs, outputs):
"""
    A post-hook to append customized attributes to all operators generated in the current layer.
"""
if not in_dygraph_mode() and layer._op_recorder.is_valid:
start = layer._op_recorder.start
end = len(default_main_program().current_block().ops)
assert start >= 0 and end >= start
ops = default_main_program().current_block().ops[start:end]
layer._op_recorder.end = end
layer._op_recorder.ops = ops
for op in ops:
for attr_name, val in layer._customized_attrs.items():
op._set_attr(attr_name, val)
# remove pre-hook and post-hook
for hook_helper in layer._op_recorder.hooks:
hook_helper.remove()
return None
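These two hooks are meant to be registered as a pair on a layer running in static-graph mode: the pre-hook remembers how many ops the current block holds, and the post-hook stamps every op created in between with the layer's customized attributes, then removes both hooks. A minimal sketch of the wiring, assuming it sits next to the definitions above; `_op_recorder` and `_customized_attrs` are internal `paddle.nn.Layer` fields, so this is illustrative rather than a public API:

import paddle

paddle.enable_static()

layer = paddle.nn.Linear(4, 4)
layer._customized_attrs = {"op_namescope": "my_scope"}  # internal field, set here for illustration

pre = layer.register_forward_pre_hook(record_program_ops_pre_hook)
post = layer.register_forward_post_hook(set_op_customized_attrs_post_hook)
layer._op_recorder.hooks = [pre, post]  # the post-hook removes both when it fires

x = paddle.static.data(name="x", shape=[2, 4], dtype="float32")
y = layer(x)  # ops appended by this call now carry the customized attribute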
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from ..framework import Parameter, in_dygraph_mode, _global_flags
from ..param_attr import ParamAttr
from .. import core
from ..layer_helper_base import LayerHelperBase
from ..dygraph_utils import _append_activation_in_dygraph
class LayerObjectHelper(LayerHelperBase):
def __init__(self, name):
super().__init__(name, layer_type=name)
def append_op(
self,
type=None,
inputs=None,
outputs=None,
attrs=None,
stop_gradient=None,
):
"""append an operator for this layer object.
Args:
type: operator type
inputs: input variable of the operator
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
return self.main_program.current_block().append_op(
type=type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=stop_gradient,
)
def _multiple_input(self, inputs_in):
inputs = inputs_in
ret = []
if isinstance(inputs, (list, tuple)):
for inp in inputs:
ret.append(self.to_variable(inp))
else:
ret.append(self.to_variable(inputs))
return ret
# TODO: make it public when we need it
def _input(self, inputs_in):
inputs = self._multiple_input(inputs_in)
if len(inputs) != 1:
raise "{0} layer only takes one input in".format(self.layer_type)
return inputs[0]
def _multiple_param_attr(self, length, param_attr_in=None):
param_attr = param_attr_in
if isinstance(param_attr, ParamAttr):
param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length:
raise ValueError(
"parameter number mismatch in {}".format(self.name)
)
elif len(param_attr) == 1 and length != 1:
tmp = [None] * length
for i in range(length):
tmp[i] = copy.deepcopy(param_attr[0])
param_attr = tmp
return param_attr
def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
"""Access all inputs and params one by one
Args:
inputs_in: inputs to be iter
param_attr_in: param_attr to be iter
Returns input, param_attr
"""
param_attr_in = ParamAttr._to_attr(param_attr_in)
if isinstance(param_attr_in, bool):
raise ValueError(
'Param_attr should not be False in {}'.format(self.name)
)
inputs = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs)
param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
for ipt, param_attr in zip(inputs, param_attrs):
yield ipt, param_attr
    def input_dtype(self, inputs_in):
        """Get the input data type.
        Args:
            inputs_in: inputs whose data type is wanted
        Returns dtype of the inputs
        """
        inputs_in = inputs_in if (inputs_in is not None) else []
        inputs = self._multiple_input(inputs_in)
        dtype = None
        for each in inputs:
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
                raise ValueError(
                    "Data Type mismatch: %s to %s in %s"
                    % (dtype, each.dtype, self.name)
                )
        return dtype
    def get_parameter(self, name):
        """Get a parameter by name.
        Args:
            name: the parameter's name
        Returns the target parameter
        """
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError(
                "no Parameter named %s found in %s" % (name, self.name)
            )
        return param
    # TODO: this should no longer be called once all activation functions move to Layers
    def append_activation(self, input_var, act=None, use_cudnn=None):
        """Append an activation to the input variable.
        Args:
            input_var: the input variable; len(input_var.shape) must be
                greater than or equal to 2
            act: activation type
            use_cudnn: whether to use cudnn
        Returns the Variable after the activation has been appended.
        """
        if act is None:
            return input_var
        if isinstance(act, str):
            act = {'type': act}
        else:
            raise TypeError(
                "{} should be str in {}".format(act, self.name)
            )
if (use_cudnn is not None) and use_cudnn:
act['use_cudnn'] = use_cudnn
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
if (use_mkldnn is not None) and use_mkldnn:
act['use_mkldnn'] = use_mkldnn
act_type = act.pop('type')
if in_dygraph_mode():
res = _append_activation_in_dygraph(
input_var, act_type, use_cudnn, use_mkldnn
)
return res
else:
tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op(
type=act_type,
inputs={"X": [input_var]},
outputs={"Out": [tmp]},
attrs=act,
)
return tmp
    def is_instance(self, param, cls):
        """Check that the input parameter is an instance of the given class.
        Args:
            param: parameter to be checked
            cls: expected class of the parameter
        Raises TypeError if the check fails.
        """
        if not isinstance(param, cls):
            raise TypeError(
                "The input {0} parameter of method {1} must be {2}, in layer {3}".format(
                    param, self.layer_type, cls.__name__, self.name
                )
            )
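`LayerObjectHelper` is the internal helper that dygraph-era layers use to append ops to the current program when running under static graph. A minimal sketch of `append_activation` under that assumption; this is internal machinery, not a public API, and the helper name "demo" is made up for illustration:

import paddle

paddle.enable_static()

helper = LayerObjectHelper("demo")  # the name doubles as the layer_type
x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
y = helper.append_activation(x, act="relu")  # appends a relu op to the default program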
......@@ -442,7 +442,7 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
return wrapper
from .dygraph.layers import Layer
from paddle.nn import Layer
if not isinstance(call_func, Layer):
if callable(call_func):
......
......@@ -20,11 +20,11 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake
from paddle.incubate import asp as sparsity
from paddle.incubate.asp.supported_layer_list import (
supported_layers_and_prune_func_map,
)
from paddle.nn.layer.layers import Layer, _convert_camel_to_snake
class MyOwnLayer(Layer):
......@@ -99,10 +99,8 @@ class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase):
sparsity.add_supported_layer(CustomerLayer, my_own_pruning)
self.layer = CustomerLayer()
self.customer_prefix = (
paddle.fluid.dygraph.layers._convert_camel_to_snake(
CustomerLayer.__name__
)
self.customer_prefix = paddle.nn.layer.layers._convert_camel_to_snake(
CustomerLayer.__name__
)
self.supported_layer_count_ref = 3
......
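The test above derives its prefix by snake-casing the layer class name. Assuming `_convert_camel_to_snake` does the conversion its name suggests, a quick illustration:

from paddle.nn.layer.layers import _convert_camel_to_snake

print(_convert_camel_to_snake("CustomerLayer"))  # expected: customer_layer
print(_convert_camel_to_snake("MyOwnLayer"))     # expected: my_own_layer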
......@@ -22,7 +22,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
......@@ -39,7 +38,7 @@ epoch = 2
linear_size = 1000
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -18,7 +18,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.sharding import (
group_sharded_parallel,
......@@ -35,7 +34,7 @@ l2_decay = 1e-4
batch_size = 100
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -17,7 +17,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.sharding import (
group_sharded_parallel,
save_group_sharded_model,
......@@ -33,7 +32,7 @@ l2_decay = 1e-4
batch_size = 100
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -38,7 +37,7 @@ np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -38,7 +37,7 @@ np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -18,7 +18,6 @@
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
GroupShardedStage3,
)
......@@ -36,7 +35,7 @@ momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -23,7 +23,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.auto_parallel import engine
......@@ -86,7 +85,7 @@ class MLP_pipe(PipelineLayer):
)
class MLP_Hybrid(fluid.Layer):
class MLP_Hybrid(paddle.nn.Layer):
def __init__(
self,
embedding_size=1000,
......@@ -121,7 +120,7 @@ class MLP_Hybrid(fluid.Layer):
return y
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(
self,
embedding_size=1000,
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
def set_random_seed(seed):
......@@ -31,7 +30,7 @@ def set_random_seed(seed):
fleet.meta_parallel.model_parallel_random_seed(seed)
class ColumnLinearNet(fluid.dygraph.Layer):
class ColumnLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size, global_dtype):
super().__init__()
self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
......@@ -48,7 +47,7 @@ class ColumnLinearNet(fluid.dygraph.Layer):
return output
class RowLinearNet(fluid.dygraph.Layer):
class RowLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
......@@ -64,7 +63,7 @@ class RowLinearNet(fluid.dygraph.Layer):
return output
class EmbeddingNet(fluid.dygraph.Layer):
class EmbeddingNet(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size):
super().__init__()
self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
......@@ -76,7 +75,7 @@ class EmbeddingNet(fluid.dygraph.Layer):
return output
class SimpleMatmul(fluid.dygraph.Layer):
class SimpleMatmul(paddle.nn.Layer):
def __init__(self, weight, output_size, global_dtype):
super().__init__()
self.weight = paddle.create_parameter(
......@@ -99,7 +98,7 @@ class SimpleMatmul(fluid.dygraph.Layer):
return output
class SimpleEmbedding(fluid.dygraph.Layer):
class SimpleEmbedding(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size, weight):
super().__init__()
self.embedding = paddle.nn.Embedding(
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
def set_random_seed(seed, dp_id, rank_id):
......@@ -62,7 +61,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
return logits
class SimpleMPNet(fluid.dygraph.Layer):
class SimpleMPNet(paddle.nn.Layer):
def __init__(
self,
vocab_size,
......@@ -128,7 +127,7 @@ class SimpleMPNet(fluid.dygraph.Layer):
return x
class SimpleDPNet(fluid.dygraph.Layer):
class SimpleDPNet(paddle.nn.Layer):
def __init__(
self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
):
......
......@@ -22,8 +22,7 @@ import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.nn as nn
from paddle.distributed.fleet.meta_parallel import PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Sequential
from paddle.nn import Layer, Sequential
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ from paddle.distributed.fleet.meta_parallel import (
PipelineLayer,
PipelineParallelWithInterleave,
)
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
class ReshapeHelp(Layer):
......
......@@ -24,7 +24,7 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddle import framework
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def set_random_seed(seed, dp_id, rank_id):
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import (
DygraphShardingOptimizer,
)
......@@ -58,7 +57,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
return logits
class SimpleMPNet(fluid.dygraph.Layer):
class SimpleMPNet(paddle.nn.Layer):
def __init__(
self,
vocab_size,
......@@ -124,7 +123,7 @@ class SimpleMPNet(fluid.dygraph.Layer):
return x
class SimpleDPNet(fluid.dygraph.Layer):
class SimpleDPNet(paddle.nn.Layer):
def __init__(
self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
):
......
......@@ -26,7 +26,7 @@ from paddle.distributed.fleet.meta_parallel import (
PipelineLayer,
SharedLayerDesc,
)
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Layer
def print_hook_fn(grad):
......
......@@ -16,14 +16,13 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
paddle.seed(123)
np.random.seed(2021)
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, hidden_size, vocab_size, is_sparse=False):
super().__init__()
self.hidden_size = hidden_size
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
......@@ -27,7 +26,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = paddle.nn.Sequential(
......
......@@ -34,7 +34,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -17,7 +17,6 @@ from parallel_dygraph_no_sync import TestNoSync
from test_dist_base import runtime_main
import paddle
import paddle.fluid as fluid
from paddle.nn import Linear
seed = 90
......@@ -26,7 +25,7 @@ batch_size = 4
batch_num = 1000
class SimpleNetControlFlow(fluid.Layer):
class SimpleNetControlFlow(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -17,7 +17,6 @@ from parallel_dygraph_no_sync import TestNoSync
from test_dist_base import runtime_main
import paddle
import paddle.fluid as fluid
from paddle.nn import Linear
seed = 90
......@@ -26,7 +25,7 @@ batch_size = 4
batch_num = 1000
class SimpleNetUnusedParam(fluid.Layer):
class SimpleNetUnusedParam(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = Linear(10, 20)
......
......@@ -76,7 +76,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -109,7 +109,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -143,7 +143,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -207,7 +207,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -21,7 +21,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Conv2D, SyncBatchNorm
class TestLayer(fluid.dygraph.Layer):
class TestLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -18,7 +18,8 @@ from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.nn import Layer
from paddle.optimizer.lr import NoamDecay
"""
......
......@@ -17,12 +17,11 @@ import unittest
import paddle
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
paddle.enable_static()
class ColumnLinearNet(fluid.dygraph.Layer):
class ColumnLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
......@@ -39,7 +38,7 @@ class ColumnLinearNet(fluid.dygraph.Layer):
return output
class RowLinearNet(fluid.dygraph.Layer):
class RowLinearNet(paddle.nn.Layer):
def __init__(self, input_size, output_size):
super().__init__()
self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
......@@ -55,7 +54,7 @@ class RowLinearNet(fluid.dygraph.Layer):
return output
class EmbeddingNet(fluid.dygraph.Layer):
class EmbeddingNet(paddle.nn.Layer):
def __init__(self, vocab_size, hidden_size):
super().__init__()
self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
......
......@@ -29,7 +29,7 @@ if fluid.core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
class SimpleConv(fluid.dygraph.Layer):
class SimpleConv(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -21,7 +21,6 @@ import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
......@@ -44,7 +43,7 @@ momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -16,9 +16,8 @@ from transformer_dygraph_model import MultiHeadAttention, PrePostProcessLayer
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer
from paddle.jit.api import to_static
from paddle.nn import Linear
from paddle.nn import Layer, Linear
class PositionwiseFeedForwardLayer(Layer):
......
......@@ -13,13 +13,12 @@
# limitations under the License.
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from paddle.nn import BatchNorm
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
ch_in,
......@@ -68,7 +67,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return out
class DownSample(fluid.dygraph.Layer):
class DownSample(paddle.nn.Layer):
def __init__(
self, ch_in, ch_out, filter_size=3, stride=2, padding=1, is_test=True
):
......@@ -90,7 +89,7 @@ class DownSample(fluid.dygraph.Layer):
return out
class BasicBlock(fluid.dygraph.Layer):
class BasicBlock(paddle.nn.Layer):
def __init__(self, ch_in, ch_out, is_test=True):
super().__init__()
......@@ -118,7 +117,7 @@ class BasicBlock(fluid.dygraph.Layer):
return out
class LayerWarp(fluid.dygraph.Layer):
class LayerWarp(paddle.nn.Layer):
def __init__(self, ch_in, ch_out, count, is_test=True):
super().__init__()
......@@ -142,7 +141,7 @@ class LayerWarp(fluid.dygraph.Layer):
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
class DarkNet53_conv_body(fluid.dygraph.Layer):
class DarkNet53_conv_body(paddle.nn.Layer):
def __init__(self, ch_in=3, is_test=True):
super().__init__()
self.stages = DarkNet_cfg[53]
......
......@@ -232,7 +232,7 @@ def nested_if_else_3(x):
return res
class NetWithControlFlowIf(fluid.dygraph.Layer):
class NetWithControlFlowIf(paddle.nn.Layer):
def __init__(self, hidden_dim=16):
super().__init__()
self.hidden_dim = hidden_dim
......
......@@ -19,10 +19,9 @@ from seq2seq_utils import Seq2SeqModelHyperParams as args
import paddle
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph.base import to_variable
from paddle.jit.api import to_static
from paddle.nn import Embedding
from paddle.nn import Embedding, Layer
INF = 1.0 * 1e5
alpha = 0.6
......@@ -84,7 +83,7 @@ class BasicLSTMUnit(Layer):
return new_hidden, new_cell
class BaseModel(fluid.dygraph.Layer):
class BaseModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......@@ -511,7 +510,7 @@ class BaseModel(fluid.dygraph.Layer):
return predicted_ids
class AttentionModel(fluid.dygraph.Layer):
class AttentionModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -17,8 +17,8 @@ from functools import reduce
import paddle
import paddle.fluid.param_attr as attr
from paddle.common_ops_import import Variable
from paddle.fluid.dygraph import Layer
from paddle.jit.api import to_static
from paddle.nn import Layer
class EmbeddingLayer:
......
......@@ -99,7 +99,7 @@ def _get_interp1d_bin_mask(
return p_mask
class Conv1D(fluid.dygraph.Layer):
class Conv1D(paddle.nn.Layer):
def __init__(
self,
prefix,
......@@ -140,7 +140,7 @@ class Conv1D(fluid.dygraph.Layer):
return x
class BMN(fluid.dygraph.Layer):
class BMN(paddle.nn.Layer):
def __init__(self, cfg):
super().__init__()
......
......@@ -117,7 +117,7 @@ class TestRecursiveCall1(unittest.TestCase):
lambda_fun = lambda x: x
class MyConvLayer(fluid.dygraph.Layer):
class MyConvLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._conv = paddle.nn.Conv2D(
......@@ -145,7 +145,7 @@ class MyConvLayer(fluid.dygraph.Layer):
return x_v
class MyLayer(fluid.dygraph.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -61,7 +61,7 @@ IMAGE_SIZE = 64
SEED = 2020
class Cycle_Gan(fluid.dygraph.Layer):
class Cycle_Gan(paddle.nn.Layer):
def __init__(self, input_channel, istrain=True):
super().__init__()
......@@ -151,7 +151,7 @@ class Cycle_Gan(fluid.dygraph.Layer):
return rec_A, fake_pool_rec_A
class build_resnet_block(fluid.dygraph.Layer):
class build_resnet_block(paddle.nn.Layer):
def __init__(self, dim, use_bias=False):
super().__init__()
......@@ -185,7 +185,7 @@ class build_resnet_block(fluid.dygraph.Layer):
return out_res + inputs
class build_generator_resnet_9blocks(fluid.dygraph.Layer):
class build_generator_resnet_9blocks(paddle.nn.Layer):
def __init__(self, input_channel):
super().__init__()
......@@ -267,7 +267,7 @@ class build_generator_resnet_9blocks(fluid.dygraph.Layer):
return y
class build_gen_discriminator(fluid.dygraph.Layer):
class build_gen_discriminator(paddle.nn.Layer):
def __init__(self, input_channel):
super().__init__()
......@@ -330,7 +330,7 @@ class build_gen_discriminator(fluid.dygraph.Layer):
return y
class conv2d(fluid.dygraph.Layer):
class conv2d(paddle.nn.Layer):
"""docstring for Conv2D"""
def __init__(
......@@ -398,7 +398,7 @@ class conv2d(fluid.dygraph.Layer):
return conv
class DeConv2D(fluid.dygraph.Layer):
class DeConv2D(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -21,12 +21,13 @@ from test_basic_api_transformation import dyfunc_to_variable
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.jit.dy2static.program_translator import (
ConcreteProgram,
StaticFunction,
)
from paddle.nn import Layer
from paddle.static import InputSpec
......
......@@ -25,7 +25,7 @@ PLACE = (
)
class SubNetWithDict(fluid.dygraph.Layer):
class SubNetWithDict(paddle.nn.Layer):
def __init__(self, hidden_size=16, output_size=16):
super().__init__()
......@@ -72,7 +72,7 @@ class SubNetWithDict(fluid.dygraph.Layer):
return out
class MainNetWithDict(fluid.dygraph.Layer):
class MainNetWithDict(paddle.nn.Layer):
def __init__(self, batch_size=64, hidden_size=16, output_size=16):
super().__init__()
self.batch_size = batch_size
......
......@@ -67,7 +67,7 @@ def func_decorated_by_other_2():
return 1
class LayerErrorInCompiletime(fluid.dygraph.Layer):
class LayerErrorInCompiletime(paddle.nn.Layer):
def __init__(self, fc_size=20):
super().__init__()
self._linear = paddle.nn.Linear(fc_size, fc_size)
......@@ -82,7 +82,7 @@ class LayerErrorInCompiletime(fluid.dygraph.Layer):
return out
class LayerErrorInCompiletime2(fluid.dygraph.Layer):
class LayerErrorInCompiletime2(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -23,7 +23,7 @@ from paddle.jit.api import to_static
SEED = 2020
class Pool2D(fluid.dygraph.Layer):
class Pool2D(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.pool2d = paddle.nn.AvgPool2D(kernel_size=2, stride=1)
......@@ -38,7 +38,7 @@ class Pool2D(fluid.dygraph.Layer):
return pre
class Linear(fluid.dygraph.Layer):
class Linear(paddle.nn.Layer):
def __init__(self, input_dim=10, output_dim=5):
super().__init__()
self.fc = paddle.nn.Linear(
......
......@@ -289,7 +289,7 @@ class TestAst2FuncWithExternalFunc(TestDygraphIfElse):
self.dyfunc = call_external_func
class NetWithExternalFunc(fluid.dygraph.Layer):
class NetWithExternalFunc(paddle.nn.Layer):
@paddle.jit.to_static
def forward(self, x, label=None):
if paddle.mean(x) < 0:
......
......@@ -40,7 +40,7 @@ input_specs = [
]
class DynamicGRU(fluid.dygraph.Layer):
class DynamicGRU(paddle.nn.Layer):
def __init__(
self,
size,
......@@ -90,7 +90,7 @@ class DynamicGRU(fluid.dygraph.Layer):
return res
class BiGRU(fluid.dygraph.Layer):
class BiGRU(paddle.nn.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super().__init__()
......@@ -158,7 +158,7 @@ class BiGRU(fluid.dygraph.Layer):
return bi_merge
class LinearChainCRF(fluid.dygraph.Layer):
class LinearChainCRF(paddle.nn.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super().__init__()
......@@ -222,7 +222,7 @@ class LinearChainCRF(fluid.dygraph.Layer):
return log_likelihood
class CRFDecoding(fluid.dygraph.Layer):
class CRFDecoding(paddle.nn.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super().__init__()
......@@ -271,7 +271,7 @@ class CRFDecoding(fluid.dygraph.Layer):
return viterbi_path
class ChunkEval(fluid.dygraph.Layer):
class ChunkEval(paddle.nn.Layer):
def __init__(
self, num_chunk_types, chunk_scheme, excluded_chunk_types=None
):
......@@ -344,7 +344,7 @@ class ChunkEval(fluid.dygraph.Layer):
)
class LexNet(fluid.dygraph.Layer):
class LexNet(paddle.nn.Layer):
def __init__(self, args, length=None):
super().__init__()
"""
......
......@@ -34,7 +34,7 @@ if paddle.fluid.is_compiled_with_cuda():
paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -80,7 +80,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -36,7 +36,7 @@ if fluid.is_compiled_with_cuda():
SEED = 2020
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -83,7 +83,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class DepthwiseSeparable(fluid.dygraph.Layer):
class DepthwiseSeparable(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -120,7 +120,7 @@ class DepthwiseSeparable(fluid.dygraph.Layer):
return y
class MobileNetV1(fluid.dygraph.Layer):
class MobileNetV1(paddle.nn.Layer):
def __init__(self, scale=1.0, class_dim=1000):
super().__init__()
self.scale = scale
......@@ -276,7 +276,7 @@ class MobileNetV1(fluid.dygraph.Layer):
return y
class InvertedResidualUnit(fluid.dygraph.Layer):
class InvertedResidualUnit(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -329,7 +329,7 @@ class InvertedResidualUnit(fluid.dygraph.Layer):
return y
class InvresiBlocks(fluid.dygraph.Layer):
class InvresiBlocks(paddle.nn.Layer):
def __init__(self, in_c, t, c, n, s):
super().__init__()
......@@ -366,7 +366,7 @@ class InvresiBlocks(fluid.dygraph.Layer):
return y
class MobileNetV2(fluid.dygraph.Layer):
class MobileNetV2(paddle.nn.Layer):
def __init__(self, class_dim=1000, scale=1.0):
super().__init__()
self.scale = scale
......
......@@ -173,7 +173,7 @@ class TestWithNoGrad(unittest.TestCase):
)
class GPT2LMHeadModel(fluid.dygraph.Layer):
class GPT2LMHeadModel(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.embedding0 = paddle.nn.Embedding(20, 16)
......
......@@ -203,7 +203,7 @@ class StaticCode2:
return __return_value_1
class NetWithError(fluid.dygraph.layers.Layer):
class NetWithError(paddle.nn.Layer):
@to_static
def forward(self, x):
linear = paddle.nn.Linear(32, 64)
......@@ -240,7 +240,7 @@ class TestEnableDeclarative(unittest.TestCase):
)
class Net(fluid.dygraph.layers.Layer):
class Net(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -28,7 +28,7 @@ PRINT_STEP = 20
SEED = 2020
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -128,7 +128,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -22,8 +22,9 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.nn import Layer
SEED = 2020
......
......@@ -55,7 +55,7 @@ def optimizer_setting(parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -86,7 +86,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super().__init__()
......@@ -140,7 +140,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return layer_helper.append_activation(y)
class ResNet(fluid.dygraph.Layer):
class ResNet(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -33,7 +33,7 @@ place = (
)
class SimpleFcLayer(fluid.dygraph.Layer):
class SimpleFcLayer(paddle.nn.Layer):
def __init__(self, fc_size):
super().__init__()
self._linear = paddle.nn.Linear(fc_size, fc_size)
......
......@@ -89,7 +89,7 @@ def optimizer_setting(params, parameter_list):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -120,7 +120,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -154,7 +154,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -218,7 +218,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
......@@ -32,7 +32,7 @@ if fluid.is_compiled_with_cuda():
fluid.set_flags({'FLAGS_cudnn_deterministic': True})
class SimpleConvPool(fluid.dygraph.Layer):
class SimpleConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -57,7 +57,7 @@ class SimpleConvPool(fluid.dygraph.Layer):
return x
class CNN(fluid.dygraph.Layer):
class CNN(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -112,7 +112,7 @@ class CNN(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class BOW(fluid.dygraph.Layer):
class BOW(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -157,7 +157,7 @@ class BOW(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class GRU(fluid.dygraph.Layer):
class GRU(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......@@ -205,7 +205,7 @@ class GRU(fluid.dygraph.Layer):
return avg_cost, prediction, acc
class BiGRU(fluid.dygraph.Layer):
class BiGRU(paddle.nn.Layer):
def __init__(self, dict_dim, batch_size, seq_len):
super().__init__()
self.dict_dim = dict_dim
......
......@@ -49,7 +49,7 @@ def parse_args():
return args
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -86,7 +86,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self, num_channels, num_filters, stride, shortcut=True, seg_num=8
):
......@@ -138,7 +138,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class TSM_ResNet(fluid.dygraph.Layer):
class TSM_ResNet(paddle.nn.Layer):
def __init__(self, name_scope, config, mode):
super().__init__(name_scope)
......
......@@ -219,7 +219,7 @@ def build_batch(dataset, batch_size, epoch_num):
)
class SkipGram(fluid.dygraph.Layer):
class SkipGram(paddle.nn.Layer):
def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
super().__init__(name_scope)
self.vocab_size = vocab_size
......
......@@ -17,9 +17,9 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import Layer, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import dygraph_to_static_func
from paddle.nn import Linear
from paddle.nn import Layer, Linear
def position_encoding_init(n_position, d_pos_vec):
......
......@@ -130,7 +130,7 @@ cfg.use_gpu = fluid.is_compiled_with_cuda()
cfg.class_num = 80
class YoloDetectionBlock(fluid.dygraph.Layer):
class YoloDetectionBlock(paddle.nn.Layer):
def __init__(self, ch_in, channel, is_test=True):
super().__init__()
......@@ -197,7 +197,7 @@ class YoloDetectionBlock(fluid.dygraph.Layer):
return route, tip
class Upsample(fluid.dygraph.Layer):
class Upsample(paddle.nn.Layer):
def __init__(self, scale=2):
super().__init__()
self.scale = scale
......@@ -219,7 +219,7 @@ class Upsample(fluid.dygraph.Layer):
return out
class YOLOv3(fluid.dygraph.Layer):
class YOLOv3(paddle.nn.Layer):
def __init__(self, ch_in, is_train=True, use_random=False):
super().__init__()
......
......@@ -20,8 +20,7 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.nn import Sequential
from paddle.nn import Layer, Sequential
class ReshapeHelp(Layer):
......
......@@ -31,7 +31,7 @@ sys.path.append("..")
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
class TestLayer(fluid.dygraph.Layer):
class TestLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -18,7 +18,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
from paddle.nn import Linear
paddle.seed(1024)
......@@ -29,7 +28,7 @@ in_dim = 10
out_dim = 20
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self, train_id):
super().__init__()
self.w1 = self.create_parameter(
......
......@@ -16,11 +16,10 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -66,7 +65,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
np.random.seed(2021)
paddle.seed(1024)
......@@ -25,7 +24,7 @@ batch_size = 4
batch_num = 1000
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.net_a = paddle.nn.Sequential(
......
......@@ -16,7 +16,6 @@ import numpy as np
from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
......@@ -24,7 +23,7 @@ np.random.seed(2021)
paddle.seed(1024)
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(self):
        # bias is an unused parameter, and it is shared with net_a
super().__init__()
......
......@@ -21,7 +21,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Embedding
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -22,7 +22,7 @@ from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import EagerParamBase, ParamBase, in_dygraph_mode
class L1(fluid.Layer):
class L1(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._param_attr = fluid.ParamAttr(
......@@ -39,7 +39,7 @@ class L1(fluid.Layer):
return self.w1 + self.w2
class L2(fluid.Layer):
class L2(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.layer1 = L1()
......@@ -49,7 +49,7 @@ class L2(fluid.Layer):
return self.layer1() + self.layer2()
class L3(fluid.Layer):
class L3(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.layer1 = L2()
......@@ -97,7 +97,7 @@ class TestBaseLayer(unittest.TestCase):
def test_add_parameter_with_error(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
param = net.create_parameter(shape=[1])
with self.assertRaises(TypeError):
......@@ -121,7 +121,7 @@ class TestBaseLayer(unittest.TestCase):
net.add_parameter("load_param", load_param)
class BufferLayer(fluid.Layer):
class BufferLayer(paddle.nn.Layer):
def __init__(self):
super().__init__()
buffer_var = to_variable(np.zeros([2, 4]).astype('int32'))
......@@ -131,7 +131,7 @@ class BufferLayer(fluid.Layer):
pass
class BufferNet(fluid.Layer):
class BufferNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.buffer_layer = BufferLayer()
......@@ -173,7 +173,7 @@ class TestBuffer(unittest.TestCase):
def test_register_buffer_with_error(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var = to_variable(np.zeros([1]))
with self.assertRaisesRegex(
......@@ -217,7 +217,7 @@ class TestBuffer(unittest.TestCase):
def test_register_buffer_same_name(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
var2 = to_variable(np.zeros([2]))
var3 = to_variable(np.zeros([3]))
......@@ -231,7 +231,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -240,7 +240,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_del(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
del net.buffer_name
......@@ -248,7 +248,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_overwrite(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
var2 = to_variable(np.zeros([2]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -264,7 +264,7 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_assign(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
......@@ -288,14 +288,14 @@ class TestBuffer(unittest.TestCase):
def test_buffer_not_persistable_load(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([1]))
net.register_buffer("buffer_name", var1, persistable=False)
net.load_dict({})
def test_buffer_state_dict(self):
with fluid.dygraph.guard():
net = fluid.Layer()
net = paddle.nn.Layer()
var1 = to_variable(np.zeros([2, 3]))
var2 = to_variable(np.zeros([3, 2]))
net.register_buffer("buffer_var1", var1)
......@@ -307,7 +307,7 @@ class TestBuffer(unittest.TestCase):
)
# load state_dict
net_load = fluid.Layer()
net_load = paddle.nn.Layer()
var = to_variable(np.ones([2, 3]))
net_load.register_buffer("buffer_var1", var)
net_load.load_dict(net.state_dict())
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
from paddle.nn import Linear
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -68,7 +68,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self, dtype="float32"):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.nn import Linear
SEED = 123123111
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -73,7 +73,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -22,7 +22,7 @@ from paddle.nn import Embedding
from paddle.tensor import random
class AutoPruneLayer0(fluid.Layer):
class AutoPruneLayer0(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear1 = paddle.nn.Linear(
......@@ -50,7 +50,7 @@ class AutoPruneLayer0(fluid.Layer):
return d
class AutoPruneLayer1(fluid.Layer):
class AutoPruneLayer1(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear1 = paddle.nn.Linear(
......@@ -79,7 +79,7 @@ class AutoPruneLayer1(fluid.Layer):
return d
class AutoPruneLayer2(fluid.Layer):
class AutoPruneLayer2(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear = paddle.nn.Linear(input_size, 10)
......@@ -98,7 +98,7 @@ class AutoPruneLayer2(fluid.Layer):
return loss
class AutoPruneLayer3(fluid.Layer):
class AutoPruneLayer3(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self.linear = paddle.nn.Linear(input_size, 20)
......@@ -117,7 +117,7 @@ class AutoPruneLayer3(fluid.Layer):
return loss, part1, part2
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, input_size, vocab_size, size, dtype="float32"):
super().__init__(dtype=dtype)
self.embed0 = Embedding(vocab_size, size)
......@@ -139,7 +139,7 @@ class MyLayer(fluid.Layer):
return loss
class MyLayer2(fluid.Layer):
class MyLayer2(paddle.nn.Layer):
def __init__(self, input_size, vocab_size, size, dtype="float32"):
super().__init__(dtype=dtype)
self.embed0 = Embedding(vocab_size, size)
......
......@@ -20,7 +20,7 @@ import paddle
import paddle.fluid as fluid
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, layerlist):
super().__init__()
self.layerlist = layerlist
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
from paddle import _legacy_C_ops
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, num_stacked_param, use_fluid_api):
super().__init__()
# create ParameterList with iterable Parameters
......
......@@ -22,7 +22,7 @@ import paddle.fluid.core as core
from paddle.nn import Linear
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Linear
class DMF(fluid.Layer):
class DMF(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._user_latent = Linear(1000, 256)
......@@ -78,7 +78,7 @@ class DMF(fluid.Layer):
return paddle.multiply(users, items)
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._user_latent = Linear(1000, 256)
......@@ -111,7 +111,7 @@ class MLP(fluid.Layer):
return match_vec
class DeepCF(fluid.Layer):
class DeepCF(paddle.nn.Layer):
def __init__(self, num_users, num_items, matrix):
super().__init__()
self._num_users = num_users
......
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
self._linear1 = paddle.nn.Linear(
......
......@@ -25,7 +25,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Linear
class Discriminator(fluid.Layer):
class Discriminator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._fc1 = Linear(1, 32)
......@@ -38,7 +38,7 @@ class Discriminator(fluid.Layer):
return x
class Generator(fluid.Layer):
class Generator(paddle.nn.Layer):
def __init__(self):
super().__init__()
self._fc1 = Linear(2, 64)
......
......@@ -30,7 +30,7 @@ def gen_data():
pass
class GraphConv(fluid.Layer):
class GraphConv(paddle.nn.Layer):
def __init__(self, name_scope, in_features, out_features):
super().__init__(name_scope)
......@@ -51,7 +51,7 @@ class GraphConv(fluid.Layer):
return paddle.matmul(adj, support) + self.bias
class GCN(fluid.Layer):
class GCN(paddle.nn.Layer):
def __init__(self, name_scope, num_hidden):
super().__init__(name_scope)
self.gc = GraphConv(self.full_name(), num_hidden, 32)
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
import paddle.nn as nn
class LeNetDygraph(fluid.dygraph.Layer):
class LeNetDygraph(paddle.nn.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super().__init__()
self.num_classes = num_classes
......
......@@ -21,7 +21,7 @@ import paddle.fluid as fluid
import paddle.nn as nn
class LeNetDygraph(fluid.dygraph.Layer):
class LeNetDygraph(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.features = nn.Sequential(
......
......@@ -170,7 +170,7 @@ class TestDygraphLoadStatic(unittest.TestCase):
with fluid.dygraph.guard():
class MyTest(fluid.dygraph.Layer):
class MyTest(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -25,7 +25,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.optimizer import SGDOptimizer
class SimpleNet(fluid.Layer):
class SimpleNet(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -25,7 +25,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Linear
class SimpleImgConvPool(fluid.dygraph.Layer):
class SimpleImgConvPool(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -70,7 +70,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
return x
class MNIST(fluid.dygraph.Layer):
class MNIST(paddle.nn.Layer):
def __init__(self):
super().__init__()
......
......@@ -20,7 +20,7 @@ import paddle
import paddle.fluid as fluid
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, num_channel, dim, num_filter=5):
super().__init__()
self.fc = paddle.nn.Linear(dim, dim)
......@@ -84,7 +84,7 @@ class TestImperativeNamedParameters(unittest.TestCase):
def test_dir_layer(self):
with fluid.dygraph.guard():
class Mymodel(fluid.dygraph.Layer):
class Mymodel(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.linear1 = paddle.nn.Linear(10, 10)
......
......@@ -59,7 +59,7 @@ class Config:
DATA_SHAPE = [1, 16, 64]
class ConvBNPool(fluid.dygraph.Layer):
class ConvBNPool(paddle.nn.Layer):
def __init__(
self,
group,
......@@ -122,7 +122,7 @@ class ConvBNPool(fluid.dygraph.Layer):
return bn_1
class OCRConv(fluid.dygraph.Layer):
class OCRConv(paddle.nn.Layer):
def __init__(self, is_test=False, use_cudnn=True):
super().__init__()
self.conv_bn_pool_1 = ConvBNPool(
......@@ -152,7 +152,7 @@ class OCRConv(fluid.dygraph.Layer):
return inputs_4
class DynamicGRU(fluid.dygraph.Layer):
class DynamicGRU(paddle.nn.Layer):
def __init__(
self,
size,
......@@ -193,7 +193,7 @@ class DynamicGRU(fluid.dygraph.Layer):
return res
class EncoderNet(fluid.dygraph.Layer):
class EncoderNet(paddle.nn.Layer):
def __init__(
self, rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True
):
......@@ -277,7 +277,7 @@ class EncoderNet(fluid.dygraph.Layer):
return gru_backward, encoded_vector, encoded_proj
class SimpleAttention(fluid.dygraph.Layer):
class SimpleAttention(paddle.nn.Layer):
def __init__(self, decoder_size):
super().__init__()
......@@ -312,7 +312,7 @@ class SimpleAttention(fluid.dygraph.Layer):
return context
class GRUDecoderWithAttention(fluid.dygraph.Layer):
class GRUDecoderWithAttention(paddle.nn.Layer):
def __init__(self, decoder_size, num_classes):
super().__init__()
self.simple_attention = SimpleAttention(decoder_size)
......@@ -359,7 +359,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
return res1
class OCRAttention(fluid.dygraph.Layer):
class OCRAttention(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.encoder_net = EncoderNet()
......
......@@ -46,7 +46,7 @@ from paddle.fluid.optimizer import (
# In dygraph, ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer and RecomputeOptimizer are not supported.
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -43,7 +43,7 @@ from paddle.fluid.optimizer import (
# In dygraph, ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer and RecomputeOptimizer are not supported.
class MLP(fluid.Layer):
class MLP(paddle.nn.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
......
......@@ -24,7 +24,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
class MyLayer(fluid.Layer):
class MyLayer(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -27,7 +27,7 @@ from paddle.fluid.optimizer import SGDOptimizer
from paddle.nn import Embedding
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -145,7 +145,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -23,7 +23,7 @@ import paddle.fluid.core as core
from paddle.fluid.dygraph.base import to_variable
class RecurrentTest(fluid.Layer):
class RecurrentTest(paddle.nn.Layer):
def __init__(self, name_scope):
super().__init__(name_scope)
......
......@@ -24,7 +24,7 @@ from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
class Policy(fluid.dygraph.Layer):
class Policy(paddle.nn.Layer):
def __init__(self, input_size):
super().__init__()
......
......@@ -75,7 +75,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -107,7 +107,7 @@ class ConvBNLayer(fluid.Layer):
return y
class BottleneckBlock(fluid.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self, num_channels, num_filters, stride, shortcut=True, use_cudnn=False
):
......@@ -163,7 +163,7 @@ class BottleneckBlock(fluid.Layer):
return layer_helper.append_activation(y)
class ResNet(fluid.Layer):
class ResNet(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102, use_cudnn=True):
super().__init__()
......
......@@ -27,7 +27,7 @@ from paddle.nn import Embedding
from paddle.optimizer import Adam
class SimpleLSTMRNN(fluid.Layer):
class SimpleLSTMRNN(paddle.nn.Layer):
def __init__(
self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
):
......@@ -142,7 +142,7 @@ class SimpleLSTMRNN(fluid.Layer):
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
class PtbModel(paddle.nn.Layer):
def __init__(
self,
hidden_size,
......
......@@ -64,7 +64,7 @@ def optimizer_setting(params, parameter_list=None):
return optimizer
class ConvBNLayer(fluid.dygraph.Layer):
class ConvBNLayer(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -95,7 +95,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
return y
class SqueezeExcitation(fluid.dygraph.Layer):
class SqueezeExcitation(paddle.nn.Layer):
def __init__(self, num_channels, reduction_ratio):
super().__init__()
......@@ -129,7 +129,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
return y
class BottleneckBlock(fluid.dygraph.Layer):
class BottleneckBlock(paddle.nn.Layer):
def __init__(
self,
num_channels,
......@@ -192,7 +192,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
return y
class SeResNeXt(fluid.dygraph.Layer):
class SeResNeXt(paddle.nn.Layer):
def __init__(self, layers=50, class_dim=102):
super().__init__()
......
(The remaining 14 file diffs are collapsed and not shown.)