Unverified commit 3794d171, authored by Meteor Liu, committed by GitHub

[dygraph] unify _non_static_mode(), in_dygraph_mode(), and in_dynamic_mode() (#53856)

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* [dygraph]unify _non_static_mode() in_dygraph_mode() and in_dynamic_mode()

* fixed cyclic reference that caused partial import

* fixed bad change

* fix bad import

* fix bad import

* fix bad import

* fix ut failed caused by change in_dynamic_mode

* fix ut failed caused by change in_dynamic_mode

* fixed usage of in_dynamic_mode() or in_dygraph_mode()

* revert python3 to python in .pre-commit-config.yaml

* fix merge conflicts
Parent 98f4446a
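For orientation only (not part of this diff), a minimal sketch of the single check the PR standardizes call sites on: `paddle.in_dynamic_mode()` returns True under eager (dygraph) execution and False while a static graph is being built, replacing the older fluid-level helpers `_non_static_mode()` and `in_dygraph_mode()`.

import paddle

def current_mode():
    # Unified check used by call sites after this PR; the fluid-level
    # helpers _non_static_mode() / in_dygraph_mode() are no longer needed.
    return "dynamic" if paddle.in_dynamic_mode() else "static"

print(current_mode())   # "dynamic" -- eager execution is the default
paddle.enable_static()
print(current_mode())   # "static"
paddle.disable_static()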
@@ -19,10 +19,11 @@ from enum import Enum
import numpy as np
from paddle import _C_ops, _legacy_C_ops
- from paddle.fluid import core, in_dygraph_mode
+ from paddle.fluid import core
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import _dygraph_tracer, dygraph_only
+ from paddle.framework import in_dynamic_mode
from .auto_cast import amp_global_state
@@ -307,7 +308,7 @@ class AmpScaler:
else:
param_grads_fp32.append(param._grad_ivar())
else:
- if in_dygraph_mode():
+ if in_dynamic_mode():
# It is very time-consuming to call c++ functions in a loop on the python side.
# We put this part of the code on the c++ side to improve the speed in eager mode.
(
...
@@ -689,7 +689,7 @@ def _grad_for_jacobian(ys, xs, v=None):
Tensor is the sum of gradients of outputs with respect to the i-th
inputs.
"""
- if paddle.fluid._non_static_mode():
+ if paddle.in_dynamic_mode():
# paddle.grad returns a list though the inputs is a signle Tensor. The
# follow code snippet fixes the problem by return the first element of
# xs_grad when the xs is a signle Tensor.
...
@@ -25,7 +25,6 @@ from paddle.fluid.framework import ( # noqa: F401
Variable,
_create_tensor,
_dygraph_tracer,
- _non_static_mode,
convert_np_dtype_to_dtype_,
default_main_program,
device_guard,
...
@@ -28,7 +28,7 @@ from paddle.distributed import fleet
from paddle.fluid.executor import _to_name_str
from paddle.framework import IrGraph
from paddle.framework import _current_expected_place as _get_device
- from paddle.framework import core, in_dygraph_mode
+ from paddle.framework import core, in_dynamic_mode
from paddle.metric import Metric
from paddle.static import InputSpec, Operator, Variable, global_scope
@@ -312,7 +312,7 @@ class Engine:
return inputs_spec, labels_spec
def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels):
- if in_dygraph_mode() or self._dygraph_mode:
+ if in_dynamic_mode() or self._dygraph_mode:
raise ValueError("Only support static graph mode.")
if inputs_spec:
@@ -561,7 +561,7 @@ class Engine:
self._has_prepared[mode] = True
def _build(self, mode):
- if in_dygraph_mode() or self._dygraph_mode:
+ if in_dynamic_mode() or self._dygraph_mode:
paddle.disable_static()
self._dygraph_mode = True
self._logger.info("Building model with 'to_static' method.")
@@ -1789,7 +1789,7 @@ class Engine:
self._build(mode)
self._plan(mode)
else:
- if in_dygraph_mode() or self._dygraph_mode:
+ if in_dynamic_mode() or self._dygraph_mode:
raise ValueError(
"Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`."
)
...
@@ -16,7 +16,7 @@ from collections import OrderedDict
import paddle
from paddle import _legacy_C_ops
- from paddle.framework import core, in_dygraph_mode
+ from paddle.framework import core, in_dynamic_mode
from paddle.tensor import fill_constant
from ..collective import _get_global_env, _new_ring_id
@@ -177,7 +177,7 @@ class ProcessGroup:
)
tmp = (
paddle.to_tensor([1], dtype="int32")
- if in_dygraph_mode()
+ if in_dynamic_mode()
else fill_constant([0], dtype="int32", value="1")
)
# use legacy ops
...
@@ -19,7 +19,7 @@ import paddle
# (TODO: GhostScreaming) It will be removed later.
from paddle.fluid import core
- from paddle.framework import in_dygraph_mode
+ from paddle.framework import in_dynamic_mode
from .communication.group import Group, _add_new_group, is_initialized
from .fleet.layers.mpu.mp_ops import _c_concat  # noqa: F401
@@ -128,7 +128,7 @@ def _set_group_map_backend(group, backend):
def _new_ring_id():
# NOTE(liyurui): For compatible reason, auto parallel and eager mode relay on previous syntax.
- if in_dygraph_mode():
+ if in_dynamic_mode():
global _start_ring_id
_start_ring_id += 1
return _start_ring_id + max(_get_global_env().nrings, 9)
@@ -198,7 +198,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
"""
global _custom_gid
global _group_map
- if in_dygraph_mode():
+ if in_dynamic_mode():
global _default_group_name
gid = _custom_gid if _custom_gid else _new_ring_id()
group_name = _default_group_name + str(gid)
@@ -292,7 +292,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
# hang caused by cross-creation of new_group
tmp = (
paddle.to_tensor([1], dtype="int32")
- if in_dygraph_mode()
+ if in_dynamic_mode()
else paddle.full([0], 1, dtype="int32")
)
paddle.distributed.all_reduce(tmp, sync_op=True)
...
@@ -102,7 +102,7 @@ def all_gather_object(object_list, obj, group=None):
# [{'foo': [1, 2, 3]}, {'bar': [4, 5, 6]}] (2 GPUs)
"""
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "all_gather_object doesn't support static graph mode."
tensor, len_of_tensor = convert_object_to_tensor(obj)
...
@@ -159,7 +159,7 @@ def batch_isend_irecv(p2p_op_list):
if _warn_cur_rank_not_in_group(group):
return
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
backend = group.backend
tasks = []
...
@@ -102,7 +102,7 @@ def broadcast_object_list(object_list, src, group=None):
# [{"bar": [4, 5, 6]}] (2 GPUs)
"""
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "broadcast_object_list doesn't support static graph mode."
rank = dist.get_rank()
...
@@ -55,6 +55,6 @@ def gather(tensor, gather_list=None, dst=0, group=None, sync_op=True):
# [] (2 GPUs, out for rank 1)
"""
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "gather doesn't support static graph mode yet."
return stream.gather(tensor, gather_list, dst, group, sync_op)
@@ -231,7 +231,7 @@ def get_group(id=0):
def _sync_calc_stream(tensor):
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
return paddle._legacy_C_ops.c_sync_calc_stream(tensor, tensor)
else:
op_type = 'c_sync_calc_stream'
@@ -244,7 +244,7 @@ def _sync_calc_stream(tensor):
def _sync_comm_stream(tensor, ring_id=0):
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
return paddle._legacy_C_ops.c_sync_comm_stream(
[tensor], [tensor], 'ring_id', ring_id
)
@@ -318,7 +318,7 @@ def barrier(group=None):
if group is not None and not group.is_member():
return
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
place = framework._current_expected_place()
if isinstance(place, framework.CPUPlace):
@@ -332,7 +332,7 @@ def barrier(group=None):
ring_id = 0 if group is None else group.id
barrier_tensor = paddle.full([1], 1, dtype="int32")
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
return paddle._legacy_C_ops.barrier(
barrier_tensor, barrier_tensor, 'ring_id', ring_id
)
...
@@ -56,7 +56,7 @@ class ReduceOp:
def _get_reduce_op(reduce_op, func_name):
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
if reduce_op == ReduceOp.SUM:
return framework.core.ReduceOp.SUM
elif reduce_op == ReduceOp.MAX:
...
@@ -108,7 +108,7 @@ def scatter_object_list(
# [{'bar': [4, 5, 6]}] (2 GPUs, out for rank 1)
"""
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "scatter_object_list doesn't support static graph mode."
rank = dist.get_rank()
...
@@ -171,7 +171,7 @@ def all_gather(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
if paddle.is_tensor(tensor_or_tensor_list):
return _all_gather_into_tensor_in_dygraph(
tensor_or_tensor_list, tensor, group, sync_op, use_calc_stream
...
@@ -116,7 +116,7 @@ def all_reduce(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
return _all_reduce_in_dygraph(
tensor, op, group, sync_op, use_calc_stream
...
@@ -185,7 +185,7 @@ def alltoall(
if in_tensor_or_tensor_list is None:
raise RuntimeError("The input should be specified.")
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list)
in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list)
@@ -335,7 +335,7 @@ def alltoall_single(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
return _alltoall_single_in_dygraph(
out_tensor,
...
@@ -117,7 +117,7 @@ def broadcast(tensor, src, group=None, sync_op=True, use_calc_stream=False):
"use_calc_stream can only be True in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
...
@@ -99,7 +99,7 @@ def gather(
"""
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "gather doesn't support static graph mode yet."
if _warn_cur_rank_not_in_group(group):
...
@@ -105,7 +105,7 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
"use_calc_stream can only be True in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
...
@@ -131,7 +131,7 @@ def reduce(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
dst_rank_in_group = _get_or_throw_group_rank(dst, group)
return _reduce_in_dygraph(
...
@@ -158,7 +158,7 @@ def reduce_scatter(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
if paddle.is_tensor(tensor_or_tensor_list):
return _reduce_scatter_tensor_in_dygraph(
@@ -243,7 +243,7 @@ def _reduce_scatter_base(
"use_calc_stream can only be true in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
return _reduce_scatter_tensor_in_dygraph(
out_tensor,
...
@@ -201,7 +201,7 @@ def scatter(
)
tensor_or_tensor_list = []
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
if paddle.is_tensor(tensor_or_tensor_list):
...
@@ -104,7 +104,7 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
"use_calc_stream can only be True in sync op behavior."
)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = _get_global_group() if group is None else group
dst_rank_in_group = _get_or_throw_group_rank(dst, group)
...
@@ -1176,7 +1176,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
else:
self._collective_env()
self._role_is_generated = True
- if not paddle.framework.in_dynamic_mode():
+ if not paddle.in_dynamic_mode():
self._gloo_init()
...
@@ -17,9 +17,8 @@ import os
import paddle
from paddle.fluid import compiler
- from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.wrapped_decorator import wrap_decorator
- from paddle.framework import _global_flags
+ from paddle.framework import _global_flags, in_dynamic_mode
from paddle.framework.ir import apply_build_strategy
from .base import topology as tp
@@ -281,7 +280,7 @@ class Fleet:
"CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program."
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
if self.worker_num() == 1:
# if worker_num is 1, should construct default topology & hcg
self._topology = tp.CommunicateTopology()
@@ -1270,7 +1269,7 @@ class Fleet:
)
else:
if (
- in_dygraph_mode()
+ in_dynamic_mode()
or self._role_maker._is_non_distributed()
or self._is_collective
):
@@ -1286,7 +1285,7 @@ class Fleet:
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
# imitate target optimizer retrieval
target_opt = self.user_defined_optimizer
self._context = context
...
@@ -16,7 +16,7 @@ import paddle
from paddle import _legacy_C_ops
from paddle.distributed import collective
from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
- from paddle.framework import LayerHelper, _create_tensor, in_dygraph_mode
+ from paddle.framework import LayerHelper, _create_tensor, in_dynamic_mode
from paddle.nn import Layer
from paddle.nn.utils import dygraph_utils
@@ -39,7 +39,7 @@ def _c_identity(tensor, group=None):
return
ring_id = 0 if group is None else group.id
- if in_dygraph_mode():
+ if in_dynamic_mode():
from paddle.autograd import PyLayer
class c_identity_eager(PyLayer):
@@ -108,7 +108,7 @@ def _c_concat(tensor, group=None):
rank = group.rank
nranks = group.nranks
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.c_concat(
tensor,
'ring_id',
@@ -174,7 +174,7 @@ def _c_split(tensor, group=None):
else group.nranks
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.c_split(
tensor,
'use_calc_stream',
@@ -226,7 +226,7 @@ def _mp_allreduce(
if group is not None and not group.is_member():
return
- if in_dygraph_mode():
+ if in_dynamic_mode():
group = collective._get_default_group() if group is None else group
assert op == ReduceOp.SUM, f"Unknown parameter: {op}."
@@ -308,7 +308,7 @@ def _c_lookup_table(table, index, start_index=0, name=None):
Returns:
Tensor.
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.c_embedding(
table, index, "start_index", start_index
)
@@ -401,7 +401,7 @@ def _c_softmax_with_cross_entropy(
if input_dims - 1 == label_dims:
label = paddle.unsqueeze(label, axis=-1)
- if in_dygraph_mode():
+ if in_dynamic_mode():
softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy(
logits,
label,
@@ -445,7 +445,7 @@ def _linear(x, weight, bias=None, name=None):
"""
Fuction Linear
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
pre_bias = _create_tensor(dtype=x.dtype)
_legacy_C_ops.matmul(
x,
@@ -810,7 +810,7 @@ def split(
supported_operations
)
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
raise ValueError(
"paddle.distributed.split cannot be used in dynamic "
"graph mode, plese use ParallelEmbedding, ParallelRowLinear, "
...
@@ -21,8 +21,7 @@ from paddle import _legacy_C_ops
from paddle.common_ops_import import Variable
from paddle.fluid import core
from paddle.fluid.data_feeder import check_variable_and_dtype
- from paddle.fluid.framework import in_dygraph_mode
- from paddle.framework import LayerHelper
+ from paddle.framework import LayerHelper, in_dynamic_mode
__all__ = []
@@ -218,7 +217,7 @@ def dropout(
) # semantic transfer
# dygraph using tracker, doesn't need determinate seed
- if in_dygraph_mode():
+ if in_dynamic_mode():
out, mask = _legacy_C_ops.dropout(
x,
'dropout_prob',
...
@@ -22,7 +22,7 @@ import paddle
from paddle.common_ops_import import LayerHelper
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.optimizer import Momentum, Optimizer
- from paddle.framework import core, in_dygraph_mode
+ from paddle.framework import core, in_dynamic_mode
from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
from paddle.regularizer import L1Decay, L2Decay
from paddle.static import create_global_var
@@ -46,7 +46,7 @@ class DGCMomentumOptimizer(Optimizer):
grad_clip=None,
name=None,
):
- if in_dygraph_mode():
+ if in_dynamic_mode():
raise Exception("In dygraph, don't support DGCMomentumOptimizer.")
assert (
...
@@ -533,7 +533,7 @@ class PipelineLayer(nn.Layer):
for key, comm in self.shared_comm.items():
param = getattr(self.shared_layers[key], comm['weight_attr'])
# need use trace_op to allreduce weight
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
with paddle.framework.no_grad():
paddle.distributed.all_reduce(
param.grad
...
@@ -555,7 +555,7 @@ class PipelineParallelWithInterleave(PipelineParallel):
super().__init__(layers=layers, hcg=hcg, strategy=strategy)
assert layers.get_num_virtual_stages() > 1
assert (
- framework.in_dygraph_mode()
+ framework.in_dynamic_mode()
), "virtual pipeline stage with interleave only support eager dygraph mode"
# setup for interleave scheduler
self.num_model_chunks = layers.get_num_virtual_stages()
...
@@ -220,7 +220,7 @@ def _partial_send_op(
tensor, group, use_calc_stream, ring_id, dst, nranks, rank_id
):
dst_rank_in_group = dst if group is None else group.get_group_rank(dst)
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
group = (
paddle.distributed.collective._get_default_group()
if group is None
@@ -291,7 +291,7 @@ def recv_partial(
else:
if use_calc_stream:
recv_op = paddle.distributed.recv
- elif framework.in_dygraph_mode():
+ elif framework.in_dynamic_mode():
recv_op = paddle.distributed.irecv
return recv_op(tensor.detach(), src=src_rank, group=group)
@@ -656,7 +656,7 @@ def _p2p_helper(
tasks.append(task)
_xpu_comm_group_end()
if not sync_recv:
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
# wait irecv tasks in eager dygraph mode with new comm library
for task in tasks:
assert task is not None
...
@@ -38,7 +38,7 @@ def _all_gather(tensor, buffer_size, group):
"""
assert group is not None
- if framework.in_dygraph_mode():
+ if framework.in_dynamic_mode():
out = paddle.zeros([buffer_size], dtype=tensor.dtype)
task = group.process_group.all_gather(tensor, out)
return out, task
...
@@ -15,7 +15,7 @@
import copy
from paddle.distributed import fleet
- from paddle.fluid.framework import in_dygraph_mode
+ from paddle.framework import in_dynamic_mode
from .meta_optimizers import HeterParallelOptimizer, HybridParallelOptimizer
from .utils.log_util import logger
@@ -81,7 +81,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None):
def distributed_optimizer(*args, **kwargs):
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _dygraph_distributed_optimizer(*args, **kwargs)
else:
return fleet.fleet.distributed_optimizer(*args, **kwargs)
@@ -21,7 +21,7 @@ from paddle.autograd import PyLayer
from paddle.distributed.fleet.meta_parallel.parallel_layers.random import (
get_rng_state_tracker,
)
- from paddle.framework import core, in_dygraph_mode
+ from paddle.framework import core, in_dynamic_mode
from ..utils.log_util import logger
@@ -198,7 +198,7 @@ class RecomputeFunction(PyLayer):
forward_outputs_with_grad, backward_inputs_with_grad
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
grads = tuple(
inp._grad_ivar()
for inp in detached_inputs
...
@@ -161,7 +161,7 @@ class _HPRecomputeFunction(PyLayer):
# If not marked non_differentiable, all output tensors' attr `stop gradient`
# will be reset to `False` in c++ backend.
# See https://github.com/PaddlePaddle/Paddle/blob/9d62efb0e6e5373823039d9eda96cd5905426c0a/paddle/fluid/pybind/eager_py_layer.cc#L388
- if framework.in_dygraph_mode() and state:
+ if framework.in_dynamic_mode() and state:
ctx.mark_non_differentiable(arg)
else:
ctx.inputs.append(arg)
...
@@ -20,8 +20,7 @@ from paddle.distributed import fleet
# (TODO: GhostScreaming) It will be removed later.
from paddle.fluid import core
- from paddle.fluid.framework import in_dygraph_mode
- from paddle.framework import Block, Program
+ from paddle.framework import Block, Program, in_dynamic_mode
class HybridParallelInferenceHelper:
@@ -205,7 +204,7 @@ class HybridParallelInferenceHelper:
self._device = "gpu"
assert self._device, "Only gpu are supported."
- assert not in_dygraph_mode(), "Only static graph mode is supported."
+ assert not in_dynamic_mode(), "Only static graph mode is supported."
op_maker = core.op_proto_and_checker_maker
self._op_role = op_maker.OpRole
...
@@ -17,7 +17,7 @@ from paddle import framework
from paddle.distributed.parallel import (
_split_tensors,
build_groups,
- in_dygraph_mode,
+ in_dynamic_mode,
sync_params_buffers,
)
@@ -131,7 +131,7 @@ def _broadcast_data_help(data, shape, dtype, hcg):
)
if mp_rank != 0:
- if in_dygraph_mode():
+ if in_dynamic_mode():
data._clear_data()
input_data._share_buffer_to(data)
else:
@@ -174,7 +174,7 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
for v in inputs:
if isinstance(v, core.eager.Tensor):
with framework.no_grad():
- if in_dygraph_mode() and not eval(f"v.place.is_{dev}_place")():
+ if in_dynamic_mode() and not eval(f"v.place.is_{dev}_place")():
v_gpu = v._copy_to(place, True)
v._clear_data()
v_gpu._share_buffer_to(v)
@@ -185,7 +185,7 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
for k, v in kwargs.items():
if isinstance(v, core.eager.Tensor):
with framework.no_grad():
- if in_dygraph_mode() and not eval(f"v.place.is_{dev}_place")():
+ if in_dynamic_mode() and not eval(f"v.place.is_{dev}_place")():
v_gpu = v._copy_to(place, True)
v._clear_data()
v_gpu._share_buffer_to(v)
@@ -217,7 +217,7 @@ def fused_allreduce_gradients_with_group(
):
apply_func = (
_apply_collective_grads_eager
- if in_dygraph_mode()
+ if in_dynamic_mode()
else _apply_collective_grads
)
with framework.no_grad():
...
@@ -120,7 +120,7 @@ class MixPrecisionOptimizer:
if param.stop_gradient:
continue
grad_var = param.main_grad
- if framework.in_dygraph_mode():
+ if paddle.in_dynamic_mode():
if (
hasattr(grad_var, "is_selected_rows")
and grad_var.is_selected_rows()
@@ -151,7 +151,7 @@ class MixPrecisionOptimizer:
if param.stop_gradient:
continue
grad_var = param.main_grad
- if framework.in_dygraph_mode():
+ if paddle.in_dynamic_mode():
if (
hasattr(grad_var, "is_selected_rows")
and grad_var.is_selected_rows()
...
@@ -14,7 +14,7 @@
from paddle import _legacy_C_ops
from paddle.common_ops_import import check_variable_and_dtype
- from paddle.framework import LayerHelper, in_dygraph_mode
+ from paddle.framework import LayerHelper, in_dynamic_mode
def _number_count(numbers, upper_range):
@@ -39,7 +39,7 @@ def _number_count(numbers, upper_range):
number_count = paddle.distributed.utils.number_count(numbers, upper_range)
print(number_count) # the result: [2, 0, 2, 0, 0, 0]
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.number_count(numbers, 'upper_range', upper_range)
else:
op_type = 'number_count'
@@ -86,7 +86,7 @@ def _assign_pos(x, cum_count):
pos = paddle.distributed.utils.assign_pos(x=numbers, cum_count=num_cum)
print(pos) # the result: (2, 0, 3, 1)
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.assign_pos(x, cum_count, cum_count[-1])
else:
op_type = 'assign_pos'
@@ -121,7 +121,7 @@ def _random_routing(topk_idx, topk_value, prob, topk=2):
prob: random prob, shape=(topk_idx.shape[0],)
"""
if topk == 2:
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.random_routing(prob, topk_value, topk_idx)
else:
raise RuntimeError("Not supporting static graph mode now")
@@ -150,7 +150,7 @@ def _limit_by_capacity(expert_count, capacity, n_worker):
out = paddle.distributed.utils.limit_by_capacity(expert_count, capacity, n_work)
print(out) # the result: [1, 2, 2, 4, 3, 3]
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.limit_by_capacity(
expert_count, capacity, 'n_worker', n_worker
)
@@ -195,7 +195,7 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
# Tensor(shape=[8], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
[1, 3, 3, 3, -1, 2, 1, 1])
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.prune_gate_by_capacity(
gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker
)
...
@@ -47,7 +47,7 @@ from paddle.distributed.fleet.launch_utils import check_backend
# (TODO: GhostScreaming) It will be removed later.
from paddle.framework import _set_expected_place
from paddle.framework import base as imperative_base
- from paddle.framework import core, in_dygraph_mode
+ from paddle.framework import core, in_dynamic_mode
from paddle.nn.layer import layers
from paddle.utils import deprecated
@@ -101,7 +101,7 @@ def _reshape_inplace(x, shape):
@framework.dygraph_only
def _split_tensors(coalesced_grads_and_grad_vars):
- if in_dygraph_mode():
+ if in_dynamic_mode():
for (
coalesced_grad,
origin_grad_vars,
@@ -356,7 +356,7 @@ class DataParallel(layers.Layer):
super().__init__(layers.full_name() + "_data_parallel")
assert (
- in_dygraph_mode()
+ in_dynamic_mode()
), "It's not supported to construct DataParallel in static graph mode."
self._layers = layers
@@ -381,7 +381,7 @@ class DataParallel(layers.Layer):
"constructing the DataParallel."
)
- if in_dygraph_mode():
+ if in_dynamic_mode():
self.group = (
paddle.distributed.collective._get_default_group()
if self.group is None
@@ -456,7 +456,7 @@ class DataParallel(layers.Layer):
check_layer_sparse(sublayer) for sublayer, _ in layers_param
]
- if in_dygraph_mode():
+ if in_dynamic_mode():
self.group_indices = core.eager_assign_group_by_size(
trainable_parameters,
is_sparse_gradient,
@@ -1041,7 +1041,7 @@ def init_parallel_env():
group = None
- if backend in _valid_backend_list and in_dygraph_mode():
+ if backend in _valid_backend_list and in_dynamic_mode():
if _default_group_name in _get_group_map_by_name():
return _get_group_map_by_name()[_default_group_name]
_set_default_backend(backend)
@@ -1212,7 +1212,7 @@ def get_rank(group=None):
print("The rank is %d" % dist.get_rank())
# The rank is 0
"""
- if in_dygraph_mode() and group:
+ if in_dynamic_mode() and group:
return group.rank
assert group is None, "Only support group argument in eager mode."
@@ -1244,7 +1244,7 @@ def get_world_size(group=None):
print("The world_size is %d" % dist.get_world_size())
# The world_size is 1
"""
- if in_dygraph_mode() and group:
+ if in_dynamic_mode() and group:
return group.world_size
assert group is None, "Only support group argument in eager mode."
...
@@ -14,7 +14,7 @@
from paddle import _legacy_C_ops
from paddle.common_ops_import import check_variable_and_dtype
- from paddle.framework import LayerHelper, in_dygraph_mode
+ from paddle.framework import LayerHelper, in_dynamic_mode
def global_scatter(
@@ -102,7 +102,7 @@ def global_scatter(
return
ring_id = 0 if group is None else group.id
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.global_scatter(
x,
local_count,
@@ -219,7 +219,7 @@ def global_gather(
return
ring_id = 0 if group is None else group.id
- if in_dygraph_mode():
+ if in_dynamic_mode():
return _legacy_C_ops.global_gather(
x,
local_count,
...
@@ -18,8 +18,8 @@ import numpy as np
import paddle
from paddle.distribution import exponential_family
from paddle.fluid.data_feeder import check_type, convert_dtype
- from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layers import tensor
+ from paddle.framework import in_dynamic_mode
from paddle.nn.functional import (
binary_cross_entropy_with_logits,
sigmoid,
@@ -93,7 +93,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
def __init__(self, probs, name=None):
self.name = name or 'Bernoulli'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(
probs,
'probs',
@@ -110,7 +110,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
self.dtype = paddle.get_default_dtype()
# Check probs range [0, 1].
- if _non_static_mode():
+ if in_dynamic_mode():
"""Not use `paddle.any` in static mode, which always be `True`."""
if (
paddle.any(self.probs < 0)
@@ -176,7 +176,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# [100, 2, 2]
"""
name = self.name + '_sample'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(
shape,
'shape',
@@ -255,7 +255,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# 288.66418457)
"""
name = self.name + '_rsample'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(
shape,
'shape',
@@ -317,7 +317,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# [1.])
"""
name = self.name + '_cdf'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(value, 'value', tensor.Variable, name)
value = self._check_values_dtype_in_probs(self.probs, value)
@@ -355,7 +355,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# [-1.20397282])
"""
name = self.name + '_log_prob'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(value, 'value', tensor.Variable, name)
value = self._check_values_dtype_in_probs(self.probs, value)
@@ -394,7 +394,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# [0.29999998])
"""
name = self.name + '_prob'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(value, 'value', tensor.Variable, name)
return self.log_prob(value).exp(name=name)
@@ -459,7 +459,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
# 0.33891910)
"""
name = self.name + '_kl_divergence'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(other, 'other', Bernoulli, name)
a_logits = self.logits
...
@@ -17,8 +17,8 @@ import numpy as np
import paddle
from paddle.distribution import distribution
from paddle.fluid.data_feeder import check_type, convert_dtype
- from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layers import tensor
+ from paddle.framework import in_dynamic_mode
from paddle.tensor import multinomial
@@ -90,7 +90,7 @@ class Categorical(distribution.Distribution):
logits(list|tuple|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64.
name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
"""
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(
logits,
'logits',
@@ -146,7 +146,7 @@ class Categorical(distribution.Distribution):
"""
name = self.name + '_sample'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(shape, 'shape', (list), 'sample')
num_samples = np.prod(np.array(shape))
@@ -208,7 +208,7 @@ class Categorical(distribution.Distribution):
"""
name = self.name + '_kl_divergence'
- if not _non_static_mode():
+ if not in_dynamic_mode():
check_type(other, 'other', Categorical, 'kl_divergence')
logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True)
...
@@ -15,8 +15,8 @@
import paddle
from paddle.distribution import exponential_family
from paddle.fluid.data_feeder import check_variable_and_dtype
- from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
+ from paddle.framework import in_dynamic_mode
class Dirichlet(exponential_family.ExponentialFamily):
@@ -159,7 +159,7 @@ class Dirichlet(exponential_family.ExponentialFamily):
def _dirichlet(concentration, name=None):
- if in_dygraph_mode():
+ if in_dynamic_mode():
return paddle._C_ops.dirichlet(concentration)
else:
op_type = 'dirichlet'
...
@@ -26,8 +26,8 @@ import numpy as np
import paddle
from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype, convert_dtype
- from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layers import tensor
+ from paddle.framework import in_dynamic_mode
class Distribution:
@@ -221,7 +221,7 @@ class Distribution:
Returns:
value (Tensor): Change value's dtype if value's dtype is different from param.
"""
- if in_dygraph_mode():
+ if in_dynamic_mode():
if value.dtype != param.dtype and convert_dtype(value.dtype) in [
'float32',
'float64',
...
@@ -14,7 +14,7 @@
import paddle
from paddle.distribution import distribution
- from paddle.fluid.framework import _non_static_mode
+ from paddle.framework import in_dynamic_mode
class ExponentialFamily(distribution.Distribution):
@@ -61,7 +61,7 @@ class ExponentialFamily(distribution.Distribution):
log_norm = self._log_normalizer(*natural_parameters)
- if _non_static_mode():
+ if in_dynamic_mode():
grads = paddle.grad(
log_norm.sum(), natural_parameters, create_graph=True
)
...
@@ -27,7 +27,7 @@ from paddle.distribution.laplace import Laplace
from paddle.distribution.lognormal import LogNormal
from paddle.distribution.normal import Normal
from paddle.distribution.uniform import Uniform
- from paddle.fluid.framework import _non_static_mode
+ from paddle.framework import in_dynamic_mode
__all__ = ["register_kl", "kl_divergence"]
@@ -229,7 +229,7 @@ def _kl_expfamily_expfamily(p, q):
p_log_norm = p._log_normalizer(*p_natural_params)
try:
- if _non_static_mode():
+ if in_dynamic_mode():
p_grads = paddle.grad(
p_log_norm, p_natural_params, create_graph=True
)
...
...@@ -20,8 +20,8 @@ import numpy as np ...@@ -20,8 +20,8 @@ import numpy as np
import paddle import paddle
from paddle.distribution import distribution from paddle.distribution import distribution
from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.data_feeder import check_type, convert_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layers import tensor from paddle.fluid.layers import tensor
from paddle.framework import in_dynamic_mode
from paddle.tensor import random from paddle.tensor import random
...@@ -87,7 +87,7 @@ class Normal(distribution.Distribution): ...@@ -87,7 +87,7 @@ class Normal(distribution.Distribution):
""" """
def __init__(self, loc, scale, name=None): def __init__(self, loc, scale, name=None):
if not _non_static_mode(): if not in_dynamic_mode():
check_type( check_type(
loc, loc,
'loc', 'loc',
...@@ -166,7 +166,7 @@ class Normal(distribution.Distribution): ...@@ -166,7 +166,7 @@ class Normal(distribution.Distribution):
if not isinstance(shape, Iterable): if not isinstance(shape, Iterable):
raise TypeError('sample shape must be Iterable object.') raise TypeError('sample shape must be Iterable object.')
if not _non_static_mode(): if not in_dynamic_mode():
check_type(seed, 'seed', (int), 'sample') check_type(seed, 'seed', (int), 'sample')
shape = list(shape) shape = list(shape)
...@@ -321,7 +321,7 @@ class Normal(distribution.Distribution): ...@@ -321,7 +321,7 @@ class Normal(distribution.Distribution):
Tensor, kl-divergence between two normal distributions. The data type is float32. Tensor, kl-divergence between two normal distributions. The data type is float32.
""" """
if not _non_static_mode(): if not in_dynamic_mode():
check_type(other, 'other', Normal, 'kl_divergence') check_type(other, 'other', Normal, 'kl_divergence')
name = self.name + '_kl_divergence' name = self.name + '_kl_divergence'
......
...@@ -18,8 +18,8 @@ import paddle ...@@ -18,8 +18,8 @@ import paddle
from paddle import _C_ops from paddle import _C_ops
from paddle.distribution import distribution from paddle.distribution import distribution
from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.data_feeder import check_type, convert_dtype
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layers import tensor from paddle.fluid.layers import tensor
from paddle.framework import in_dynamic_mode
from paddle.tensor import random from paddle.tensor import random
...@@ -92,7 +92,7 @@ class Uniform(distribution.Distribution): ...@@ -92,7 +92,7 @@ class Uniform(distribution.Distribution):
""" """
def __init__(self, low, high, name=None): def __init__(self, low, high, name=None):
if not _non_static_mode(): if not in_dynamic_mode():
check_type( check_type(
low, low,
'low', 'low',
...@@ -152,7 +152,7 @@ class Uniform(distribution.Distribution): ...@@ -152,7 +152,7 @@ class Uniform(distribution.Distribution):
Tensor, A tensor with prepended dimensions shape. The data type is float32. Tensor, A tensor with prepended dimensions shape. The data type is float32.
""" """
if not _non_static_mode(): if not in_dynamic_mode():
check_type(shape, 'shape', (list), 'sample') check_type(shape, 'shape', (list), 'sample')
check_type(seed, 'seed', (int), 'sample') check_type(seed, 'seed', (int), 'sample')
...@@ -205,7 +205,7 @@ class Uniform(distribution.Distribution): ...@@ -205,7 +205,7 @@ class Uniform(distribution.Distribution):
""" """
value = self._check_values_dtype_in_probs(self.low, value) value = self._check_values_dtype_in_probs(self.low, value)
if in_dygraph_mode(): if in_dynamic_mode():
# ensure value in [low, high] # ensure value in [low, high]
lb_bool = self.low < value lb_bool = self.low < value
ub_bool = value < self.high ub_bool = value < self.high
...@@ -234,7 +234,7 @@ class Uniform(distribution.Distribution): ...@@ -234,7 +234,7 @@ class Uniform(distribution.Distribution):
""" """
value = self._check_values_dtype_in_probs(self.low, value) value = self._check_values_dtype_in_probs(self.low, value)
if in_dygraph_mode(): if in_dynamic_mode():
lb_bool = self.low < value lb_bool = self.low < value
ub_bool = value < self.high ub_bool = value < self.high
lb = _C_ops.cast(lb_bool, value.dtype) lb = _C_ops.cast(lb_bool, value.dtype)
......
...@@ -20,8 +20,8 @@ import paddle ...@@ -20,8 +20,8 @@ import paddle
from . import _C_ops from . import _C_ops
from .fluid.data_feeder import check_variable_and_dtype from .fluid.data_feeder import check_variable_and_dtype
from .fluid.framework import in_dygraph_mode
from .fluid.layer_helper import LayerHelper from .fluid.layer_helper import LayerHelper
from .framework import in_dynamic_mode
from .tensor.attribute import is_floating_point, is_integer from .tensor.attribute import is_floating_point, is_integer
from .tensor.creation import _complex_to_real_dtype, _real_to_complex_dtype from .tensor.creation import _complex_to_real_dtype, _real_to_complex_dtype
...@@ -1437,7 +1437,7 @@ def fft_c2c(x, n, axis, norm, forward, name): ...@@ -1437,7 +1437,7 @@ def fft_c2c(x, n, axis, norm, forward, name):
s = [n] s = [n]
x = _resize_fft_input(x, s, axes) x = _resize_fft_input(x, s, axes)
if in_dygraph_mode(): if in_dynamic_mode():
out = _C_ops.fft_c2c(x, axes, norm, forward) out = _C_ops.fft_c2c(x, axes, norm, forward)
else: else:
op_type = 'fft_c2c' op_type = 'fft_c2c'
...@@ -1468,7 +1468,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): ...@@ -1468,7 +1468,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
_check_fft_n(n) _check_fft_n(n)
s = [n] s = [n]
x = _resize_fft_input(x, s, axes) x = _resize_fft_input(x, s, axes)
if in_dygraph_mode(): if in_dynamic_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
else: else:
op_type = 'fft_r2c' op_type = 'fft_r2c'
...@@ -1511,7 +1511,7 @@ def fft_c2r(x, n, axis, norm, forward, name): ...@@ -1511,7 +1511,7 @@ def fft_c2r(x, n, axis, norm, forward, name):
s = [n // 2 + 1] s = [n // 2 + 1]
x = _resize_fft_input(x, s, axes) x = _resize_fft_input(x, s, axes)
if in_dygraph_mode(): if in_dynamic_mode():
if n is not None: if n is not None:
out = _C_ops.fft_c2r(x, axes, norm, forward, n) out = _C_ops.fft_c2r(x, axes, norm, forward, n)
else: else:
...@@ -1570,7 +1570,7 @@ def fftn_c2c(x, s, axes, norm, forward, name): ...@@ -1570,7 +1570,7 @@ def fftn_c2c(x, s, axes, norm, forward, name):
if s is not None: if s is not None:
x = _resize_fft_input(x, s, axes) x = _resize_fft_input(x, s, axes)
if in_dygraph_mode(): if in_dynamic_mode():
out = _C_ops.fft_c2c(x, axes, norm, forward) out = _C_ops.fft_c2c(x, axes, norm, forward)
else: else:
op_type = 'fft_c2c' op_type = 'fft_c2c'
...@@ -1620,7 +1620,7 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1620,7 +1620,7 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if s is not None: if s is not None:
x = _resize_fft_input(x, s, axes) x = _resize_fft_input(x, s, axes)
if in_dygraph_mode(): if in_dynamic_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
else: else:
op_type = 'fft_r2c' op_type = 'fft_r2c'
...@@ -1684,7 +1684,7 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1684,7 +1684,7 @@ def fftn_c2r(x, s, axes, norm, forward, name):
fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1 fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1
x = _resize_fft_input(x, fft_input_shape, axes) x = _resize_fft_input(x, fft_input_shape, axes)
if in_dygraph_mode(): if in_dynamic_mode():
if s is not None: if s is not None:
out = _C_ops.fft_c2r(x, axes, norm, forward, s[-1]) out = _C_ops.fft_c2r(x, axes, norm, forward, s[-1])
else: else:
......
...@@ -229,7 +229,7 @@ class Momentum(Optimizer): ...@@ -229,7 +229,7 @@ class Momentum(Optimizer):
else None else None
) )
if framework._non_static_mode(): if framework.in_dygraph_mode():
_, _, _ = _legacy_C_ops.momentum( _, _, _ = _legacy_C_ops.momentum(
param_and_grad[0], param_and_grad[0],
param_and_grad[1], param_and_grad[1],
......
...@@ -22,8 +22,8 @@ import struct ...@@ -22,8 +22,8 @@ import struct
from .framework import ( from .framework import (
Variable, Variable,
default_main_program, default_main_program,
in_dygraph_mode,
_current_expected_place, _current_expected_place,
_non_static_mode,
) )
from .framework import _cpu_num, _cuda_ids from .framework import _cpu_num, _cuda_ids
...@@ -140,7 +140,7 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): ...@@ -140,7 +140,7 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''):
# in dynamic graph mode. # in dynamic graph mode.
# 2. Performance considerations. Because these checks are executed at # 2. Performance considerations. Because these checks are executed at
# each step in dynamic graph mode, it will bring a heavy performance burden. # each step in dynamic graph mode, it will bring a heavy performance burden.
if _non_static_mode(): if in_dygraph_mode():
return return
# NOTE: `in_declarative_mode` is used to determined whether this op is called under # NOTE: `in_declarative_mode` is used to determined whether this op is called under
...@@ -171,7 +171,7 @@ def check_dtype( ...@@ -171,7 +171,7 @@ def check_dtype(
input_dtype, input_name, expected_dtype, op_name, extra_message='' input_dtype, input_name, expected_dtype, op_name, extra_message=''
): ):
# See NOTE [ Why skip dynamic graph check ] # See NOTE [ Why skip dynamic graph check ]
if _non_static_mode(): if in_dygraph_mode():
return return
if convert_dtype(input_dtype) in ['float16']: if convert_dtype(input_dtype) in ['float16']:
warnings.warn( warnings.warn(
...@@ -208,7 +208,7 @@ def check_shape( ...@@ -208,7 +208,7 @@ def check_shape(
expected_tensor_dtype=('int32', 'int64'), expected_tensor_dtype=('int32', 'int64'),
): ):
# See NOTE [ Why skip dynamic graph check ] # See NOTE [ Why skip dynamic graph check ]
if _non_static_mode(): if in_dygraph_mode():
return return
check_type(shape, 'shape', expected_shape_type, op_name) check_type(shape, 'shape', expected_shape_type, op_name)
if expected_element_type is not None and not isinstance(shape, Variable): if expected_element_type is not None and not isinstance(shape, Variable):
......
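The three `data_feeder` hunks keep the same early-return contract, now spelled with `in_dygraph_mode()`: per the NOTE quoted in the hunk, argument checking is a static-graph-only concern, skipped in dygraph both because imperative inputs are accepted directly and because per-call checks would be paid on every step. A toy, self-contained sketch of that contract (not the real implementation):

    from paddle.fluid.framework import in_dygraph_mode

    def check_type_sketch(input, input_name, expected_type, op_name):
        # In dygraph mode every call would pay this cost, so the check is
        # skipped entirely (see the NOTE in the hunk above).
        if in_dygraph_mode():
            return
        if not isinstance(input, expected_type):
            raise TypeError(
                "The type of '%s' in %s must be %s, but received %s."
                % (input_name, op_name, expected_type, type(input))
            )
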
...@@ -106,7 +106,7 @@ def program_desc_tracing_guard(enable): ...@@ -106,7 +106,7 @@ def program_desc_tracing_guard(enable):
@signature_safe_contextmanager @signature_safe_contextmanager
def param_guard(parameters): def param_guard(parameters):
# Note: parameters is a reference of self._parameters or self._buffers # Note: parameters is a reference of self._parameters or self._buffers
if in_declarative_mode() and not framework.in_dygraph_mode() and parameters: if in_declarative_mode() and not paddle.in_dynamic_mode() and parameters:
origin_parameters = parameters.copy() origin_parameters = parameters.copy()
for name, var_base in parameters.items(): for name, var_base in parameters.items():
if isinstance(var_base, list): if isinstance(var_base, list):
......
...@@ -270,7 +270,7 @@ def monkey_patch_tensor(): ...@@ -270,7 +270,7 @@ def monkey_patch_tensor():
# 4: [5000.] # 4: [5000.]
""" """
if framework._non_static_mode(): if framework.in_dygraph_mode():
if in_profiler_mode(): if in_profiler_mode():
record_event = profiler.RecordEvent( record_event = profiler.RecordEvent(
"Gradient Backward", profiler.TracerEventType.Backward "Gradient Backward", profiler.TracerEventType.Backward
...@@ -978,21 +978,20 @@ def monkey_patch_tensor(): ...@@ -978,21 +978,20 @@ def monkey_patch_tensor():
("values", values), ("values", values),
("to_dense", to_dense), ("to_dense", to_dense),
("to_sparse_coo", to_sparse_coo), ("to_sparse_coo", to_sparse_coo),
("_set_grad_ivar", _set_grad_ivar),
("value", value),
("cpu", cpu),
("cuda", cuda),
("pin_memory", pin_memory),
("_slice", _slice),
("_numel", _numel),
("_uva", _uva),
("_clear_data", _clear_data),
("__hash__", __hash__),
("_use_gpudnn", _use_gpudnn),
): ):
setattr(core.eager.Tensor, method_name, method) setattr(core.eager.Tensor, method_name, method)
setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar)
setattr(core.eager.Tensor, "value", value)
setattr(core.eager.Tensor, "cpu", cpu)
setattr(core.eager.Tensor, "cuda", cuda)
setattr(core.eager.Tensor, "pin_memory", pin_memory)
setattr(core.eager.Tensor, "_slice", _slice)
setattr(core.eager.Tensor, "_numel", _numel)
setattr(core.eager.Tensor, "_uva", _uva)
setattr(core.eager.Tensor, "_clear_data", _clear_data)
setattr(core.eager.Tensor, "__hash__", __hash__)
setattr(core.eager.Tensor, "_use_gpudnn", _use_gpudnn)
global _already_patch_repr global _already_patch_repr
if not _already_patch_repr: if not _already_patch_repr:
# NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class.
......
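The tensor-patch hunk is the one purely mechanical refactor in this section: the trailing run of individual `setattr` calls is folded into the existing `(name, method)` loop. A self-contained toy showing the same idiom; `Tensorish` and its methods are stand-ins, not Paddle code:

    class Tensorish:
        """Stand-in for core.eager.Tensor in this illustration."""

    def cpu(self):
        return "copied to cpu"

    def pin_memory(self):
        return "pinned"

    def _numel(self):
        return 0

    # one loop instead of a tail of repeated setattr(...) calls
    for method_name, method in (
        ("cpu", cpu),
        ("pin_memory", pin_memory),
        ("_numel", _numel),
    ):
        setattr(Tensorish, method_name, method)

    t = Tensorish()
    assert t.cpu() == "copied to cpu" and t._numel() == 0
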
...@@ -52,7 +52,6 @@ __all__ = [ ...@@ -52,7 +52,6 @@ __all__ = [
'cpu_places', 'cpu_places',
'xpu_places', 'xpu_places',
'cuda_pinned_places', 'cuda_pinned_places',
'_non_static_mode',
'in_dygraph_mode', 'in_dygraph_mode',
'is_compiled_with_cinn', 'is_compiled_with_cinn',
'is_compiled_with_cuda', 'is_compiled_with_cuda',
...@@ -156,28 +155,6 @@ extra_op_attrs = { ...@@ -156,28 +155,6 @@ extra_op_attrs = {
"unique": ["is_sorted"], "unique": ["is_sorted"],
} }
# Some explanation of our execution system 2022.03
# For now we have 3 kinds of execution systems, since we refactored dygraph mode to
# build a fast execution system for dynamic mode. But we can't just remove all legacy
# code once we present the new system, for historical reasons. That's why we have
# these flags.
#
# 1. _non_static_mode():
# _non_static_mode means we are now running in legacy dygraph mode or dygraph mode.
# 2. dygraph_mode():
# This flag indicates we are now running in dygraph mode, which was called eager mode before.
# 3. _in_legacy_dygraph():
# This flag has been deprecated.
#
# They are related as follows:
# Since _in_legacy_dygraph is deprecated, dygraph_mode is _non_static_mode.
# Why did we have to distinguish _in_legacy_dygraph from dygraph_mode?
# While investigating a performance issue, we found that Python if statements caused a severe
# performance problem, and we need the new dygraph mode to be as fast as it can be. That's why
# we added these flags: to make sure that in most cases the new dygraph mode is found first,
# with only one if statement.
#
# FIXME(dev): We haven't fully verified eager mode on XPU et.al but # FIXME(dev): We haven't fully verified eager mode on XPU et.al but
# only GPU/CPU. Remove this after we improve this feature. # only GPU/CPU. Remove this after we improve this feature.
_is_first_import_ = True _is_first_import_ = True
...@@ -213,10 +190,6 @@ def in_dygraph_mode(): ...@@ -213,10 +190,6 @@ def in_dygraph_mode():
return global_var._dygraph_tracer_ is not None return global_var._dygraph_tracer_ is not None
def _non_static_mode():
return global_var._dygraph_tracer_ is not None
global_ipu_index = -1 global_ipu_index = -1
global_ipu_stage = -1 global_ipu_stage = -1
ipu_index_attr_name = 'ipu_index' ipu_index_attr_name = 'ipu_index'
...@@ -459,7 +432,7 @@ def require_version(min_version, max_version=None): ...@@ -459,7 +432,7 @@ def require_version(min_version, max_version=None):
def _dygraph_not_support_(func): def _dygraph_not_support_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
assert not _non_static_mode(), ( assert not in_dygraph_mode(), (
"We don't support %s in dynamic graph mode" % func.__name__ "We don't support %s in dynamic graph mode" % func.__name__
) )
return func(*args, **kwargs) return func(*args, **kwargs)
...@@ -469,7 +442,7 @@ def _dygraph_not_support_(func): ...@@ -469,7 +442,7 @@ def _dygraph_not_support_(func):
def _dygraph_only_(func): def _dygraph_only_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
assert _non_static_mode(), ( assert in_dygraph_mode(), (
"We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode."
% func.__name__ % func.__name__
) )
...@@ -482,7 +455,7 @@ def _non_static_only_(func): ...@@ -482,7 +455,7 @@ def _non_static_only_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
from .dygraph.base import in_declarative_mode from .dygraph.base import in_declarative_mode
assert _non_static_mode() or in_declarative_mode(), ( assert in_dygraph_mode() or in_declarative_mode(), (
"We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode."
% func.__name__ % func.__name__
) )
...@@ -493,7 +466,7 @@ def _non_static_only_(func): ...@@ -493,7 +466,7 @@ def _non_static_only_(func):
def _static_only_(func): def _static_only_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
assert not _non_static_mode(), ( assert not in_dygraph_mode(), (
"In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode." "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode."
% func.__name__ % func.__name__
) )
...@@ -971,7 +944,7 @@ def name_scope(prefix=None): ...@@ -971,7 +944,7 @@ def name_scope(prefix=None):
""" """
# TODO(panyx0718): Only [0-9a-z]. # TODO(panyx0718): Only [0-9a-z].
# in dygraph we don't need namescope since it will cause mem leak # in dygraph we don't need namescope since it will cause mem leak
if _non_static_mode(): if in_dygraph_mode():
yield yield
else: else:
assert prefix, "namescope prefix can not be empty." assert prefix, "namescope prefix can not be empty."
...@@ -2738,7 +2711,7 @@ class Operator: ...@@ -2738,7 +2711,7 @@ class Operator:
except ValueError: except ValueError:
pass pass
if _non_static_mode(): if in_dygraph_mode():
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None." "`type` to initialized an Operator can not be None."
...@@ -2924,7 +2897,7 @@ class Operator: ...@@ -2924,7 +2897,7 @@ class Operator:
else: else:
out_arg_names.append(arg.name) out_arg_names.append(arg.name)
# TODO(minqiyang): could we remove variable's op in static graph mode? # TODO(minqiyang): could we remove variable's op in static graph mode?
if not _non_static_mode(): if not in_dygraph_mode():
if isinstance(arg, str): if isinstance(arg, str):
block.var(arg).op = self block.var(arg).op = self
else: else:
...@@ -3799,7 +3772,7 @@ class Block: ...@@ -3799,7 +3772,7 @@ class Block:
) )
def create_var(self, *args, **kwargs): def create_var(self, *args, **kwargs):
if _non_static_mode(): if in_dygraph_mode():
var = _create_tensor(*args, **kwargs) var = _create_tensor(*args, **kwargs)
else: else:
var = Variable(block=self, *args, **kwargs) var = Variable(block=self, *args, **kwargs)
...@@ -3956,7 +3929,7 @@ class Block: ...@@ -3956,7 +3929,7 @@ class Block:
Operator: the append Operator. Operator: the append Operator.
""" """
op_type = kwargs.get("type", None) op_type = kwargs.get("type", None)
if _non_static_mode(): if in_dygraph_mode():
attrs = kwargs.get("attrs", {}) attrs = kwargs.get("attrs", {})
inplace_map = kwargs.get("inplace_map", None) inplace_map = kwargs.get("inplace_map", None)
warnings.warn( warnings.warn(
...@@ -4093,7 +4066,7 @@ class Block: ...@@ -4093,7 +4066,7 @@ class Block:
return self.ops[start:end] return self.ops[start:end]
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
if _non_static_mode(): if in_dygraph_mode():
type = kwargs.get("type", None) type = kwargs.get("type", None)
attrs = kwargs.get("attrs", {}) attrs = kwargs.get("attrs", {})
op = Operator( op = Operator(
...@@ -7469,7 +7442,7 @@ def _cuda_graph_guard(cuda_graph_attr=None): ...@@ -7469,7 +7442,7 @@ def _cuda_graph_guard(cuda_graph_attr=None):
cuda_graph_capture_mode;memory_pool_id;cuda_graph_id cuda_graph_capture_mode;memory_pool_id;cuda_graph_id
""" """
assert ( assert (
not _non_static_mode() not in_dygraph_mode()
), "cuda_graph_guard only works under static graph mode" ), "cuda_graph_guard only works under static graph mode"
assert ( assert (
core.is_compiled_with_cuda() core.is_compiled_with_cuda()
......
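With `_non_static_mode()` removed, the framework.py decorators above assert directly on `in_dygraph_mode()`. A minimal sketch of the decorator shape they share; `dygraph_only_sketch` and `my_dygraph_api` are hypothetical names for illustration, not Paddle APIs:

    import paddle
    from paddle.fluid.framework import in_dygraph_mode

    def dygraph_only_sketch(func):
        def __impl__(*args, **kwargs):
            assert in_dygraph_mode(), (
                "We only support '%s()' in dynamic graph mode, please call "
                "'paddle.disable_static()' first." % func.__name__
            )
            return func(*args, **kwargs)
        return __impl__

    @dygraph_only_sketch
    def my_dygraph_api(x):
        return x * 2

    paddle.disable_static()  # dynamic mode, the 2.x default
    print(my_dygraph_api(paddle.to_tensor([1.0, 2.0])))
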
...@@ -24,7 +24,7 @@ from contextlib import contextmanager ...@@ -24,7 +24,7 @@ from contextlib import contextmanager
from paddle.fluid import unique_name, compiler from paddle.fluid import unique_name, compiler
from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel
from paddle.fluid.framework import _non_static_mode, Program from paddle.fluid.framework import in_dygraph_mode, Program
g_train_epoch_range = None g_train_epoch_range = None
g_checker = None g_checker = None
...@@ -138,7 +138,7 @@ class AutoCheckpointChecker: ...@@ -138,7 +138,7 @@ class AutoCheckpointChecker:
return self._save_checkpoint_inter return self._save_checkpoint_inter
def valid(self): def valid(self):
if _non_static_mode(): if in_dygraph_mode():
return False return False
return ( return (
......
...@@ -17,7 +17,7 @@ import paddle ...@@ -17,7 +17,7 @@ import paddle
from .framework import ( from .framework import (
Parameter, Parameter,
dtype_is_floating, dtype_is_floating,
_non_static_mode, in_dygraph_mode,
OpProtoHolder, OpProtoHolder,
_global_flags, _global_flags,
) )
...@@ -159,7 +159,7 @@ class LayerHelper(LayerHelperBase): ...@@ -159,7 +159,7 @@ class LayerHelper(LayerHelperBase):
if use_mkldnn: if use_mkldnn:
act['use_mkldnn'] = use_mkldnn act['use_mkldnn'] = use_mkldnn
act_type = act.pop('type') act_type = act.pop('type')
if _non_static_mode(): if in_dygraph_mode():
res = _append_activation_in_dygraph( res = _append_activation_in_dygraph(
input_var, act_type, use_cudnn, use_mkldnn input_var, act_type, use_cudnn, use_mkldnn
) )
......
...@@ -20,7 +20,7 @@ from .framework import ( ...@@ -20,7 +20,7 @@ from .framework import (
Variable, Variable,
default_main_program, default_main_program,
default_startup_program, default_startup_program,
_non_static_mode, in_dygraph_mode,
_current_expected_place, _current_expected_place,
) )
from . import unique_name from . import unique_name
...@@ -409,7 +409,7 @@ class LayerHelperBase: ...@@ -409,7 +409,7 @@ class LayerHelperBase:
param = self._create_weight_normalize(attr, shape, dtype) param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param) WeightNormParamAttr.params_with_weight_norm.append(param)
return param return param
if _non_static_mode(): if in_dygraph_mode():
# In dygraph mode, we want the returned parameter to be # In dygraph mode, we want the returned parameter to be
# initialized so that it can be used imperatively. # initialized so that it can be used imperatively.
# check parameter name # check parameter name
...@@ -527,7 +527,7 @@ class LayerHelperBase: ...@@ -527,7 +527,7 @@ class LayerHelperBase:
initializer: initializer to use initializer: initializer to use
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
if _non_static_mode(): if in_dygraph_mode():
initializer(var, self.main_program.global_block()) initializer(var, self.main_program.global_block())
else: else:
self.startup_program.global_block().create_var( self.startup_program.global_block().create_var(
......
...@@ -37,7 +37,7 @@ class LazyInitHelper: ...@@ -37,7 +37,7 @@ class LazyInitHelper:
if self._state: if self._state:
return return
assert ( assert (
framework._non_static_mode() framework.in_dygraph_mode()
), "LazyInit.enable() is only available in dygraph mode." ), "LazyInit.enable() is only available in dygraph mode."
self._state = True self._state = True
......
...@@ -26,7 +26,7 @@ from .framework import ( ...@@ -26,7 +26,7 @@ from .framework import (
program_guard, program_guard,
default_main_program, default_main_program,
default_startup_program, default_startup_program,
_non_static_mode, in_dygraph_mode,
cpu_places, cpu_places,
_current_expected_place, _current_expected_place,
) )
...@@ -417,7 +417,7 @@ class DataLoader: ...@@ -417,7 +417,7 @@ class DataLoader:
epoch_id, batch_id, np.mean(loss.numpy()))) epoch_id, batch_id, np.mean(loss.numpy())))
""" """
if _non_static_mode(): if in_dygraph_mode():
return DygraphGeneratorLoader( return DygraphGeneratorLoader(
feed_list, feed_list,
capacity, capacity,
...@@ -1605,7 +1605,7 @@ class DatasetLoader(DataLoaderBase): ...@@ -1605,7 +1605,7 @@ class DatasetLoader(DataLoaderBase):
dataset, paddle.distributed.fleet.dataset.DatasetBase dataset, paddle.distributed.fleet.dataset.DatasetBase
), "dataset must be type of DatasetBase" ), "dataset must be type of DatasetBase"
assert ( assert (
not _non_static_mode() not in_dygraph_mode()
), "DatasetLoader is not supported in dygraph mode yet" ), "DatasetLoader is not supported in dygraph mode yet"
if isinstance(places, (list, tuple)): if isinstance(places, (list, tuple)):
places = _get_paddle_place_list(places) places = _get_paddle_place_list(places)
......
...@@ -87,7 +87,7 @@ class TransformerNet(Layer): ...@@ -87,7 +87,7 @@ class TransformerNet(Layer):
class EmbeddingPipe(EmbeddingNet): class EmbeddingPipe(EmbeddingNet):
def forward(self, tensors): def forward(self, tensors):
if framework.in_dygraph_mode(): if framework.in_dynamic_mode():
stable, x = tensors stable, x = tensors
return stable, super().forward(x) return stable, super().forward(x)
else: else:
...@@ -96,7 +96,7 @@ class EmbeddingPipe(EmbeddingNet): ...@@ -96,7 +96,7 @@ class EmbeddingPipe(EmbeddingNet):
class TransformerNetPipe(TransformerNet): class TransformerNetPipe(TransformerNet):
def forward(self, tensors): def forward(self, tensors):
if framework.in_dygraph_mode(): if framework.in_dynamic_mode():
stable, x = tensors stable, x = tensors
output = super().forward(x) output = super().forward(x)
return stable, output return stable, output
...@@ -109,7 +109,7 @@ class CriterionPipe(Layer): ...@@ -109,7 +109,7 @@ class CriterionPipe(Layer):
super().__init__() super().__init__()
def forward(self, out, label): def forward(self, out, label):
if framework.in_dygraph_mode(): if framework.in_dynamic_mode():
out = out[-1] out = out[-1]
loss = out.mean() loss = out.mean()
return loss return loss
...@@ -179,7 +179,7 @@ class TestDistPPTraning(unittest.TestCase): ...@@ -179,7 +179,7 @@ class TestDistPPTraning(unittest.TestCase):
x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) x_data = np.random.randint(0, vocab_size, size=[batch_size, length])
x = paddle.to_tensor(x_data) x = paddle.to_tensor(x_data)
x.stop_gradient = True x.stop_gradient = True
input_ = (x, x) if framework.in_dygraph_mode() else x input_ = (x, x) if framework.in_dynamic_mode() else x
loss = model.train_batch([input_, x], optimizer, scheduler) loss = model.train_batch([input_, x], optimizer, scheduler)
# TODO(shenliang03) add utest for loss # TODO(shenliang03) add utest for loss
print("loss: ", loss) print("loss: ", loss)
......
...@@ -55,7 +55,7 @@ def optimizer_setting(params, parameter_list=None): ...@@ -55,7 +55,7 @@ def optimizer_setting(params, parameter_list=None):
bd = [step * e for e in ls["epochs"]] bd = [step * e for e in ls["epochs"]]
lr = params["lr"] lr = params["lr"]
num_epochs = params["num_epochs"] num_epochs = params["num_epochs"]
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay( learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs learning_rate=lr, step_each_epoch=step, epochs=num_epochs
......
...@@ -29,8 +29,8 @@ class TestContextManagerRaiseException(unittest.TestCase): ...@@ -29,8 +29,8 @@ class TestContextManagerRaiseException(unittest.TestCase):
def test_func2(self): def test_func2(self):
# After test_func1 executed, if fluid.dygraph.guard() in test_func1 safely exited, # After test_func1 executed, if fluid.dygraph.guard() in test_func1 safely exited,
# fluid._non_static_mode() should be false. # fluid.in_dygraph_mode() should be false.
self.assertEqual(fluid._non_static_mode(), False) self.assertEqual(fluid.in_dygraph_mode(), False)
if __name__ == '__main__': if __name__ == '__main__':
......
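The rewritten assertion above is easiest to read as a statement about `fluid.dygraph.guard()`: the tracer is active inside the guard and torn down again on exit (the original test additionally checks this holds even when the guarded block raises). A compressed sketch of that check:

    import paddle
    from paddle import fluid

    paddle.enable_static()                 # start from static graph mode
    with fluid.dygraph.guard():
        assert fluid.in_dygraph_mode()     # tracer installed inside the guard
    assert not fluid.in_dygraph_mode()     # and removed again after it exits
    paddle.disable_static()                # restore the 2.x default
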
...@@ -21,7 +21,7 @@ import paddle ...@@ -21,7 +21,7 @@ import paddle
from paddle import _legacy_C_ops, fluid from paddle import _legacy_C_ops, fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.static import default_main_program from paddle.static import default_main_program
...@@ -34,7 +34,7 @@ def dropout_nd( ...@@ -34,7 +34,7 @@ def dropout_nd(
mode = ( mode = (
'downgrade_in_infer' if mode == 'downscale_in_infer' else mode 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
) # semantic transfer ) # semantic transfer
if _non_static_mode(): if in_dygraph_mode():
if default_main_program().random_seed != 0: if default_main_program().random_seed != 0:
seed = default_main_program().random_seed seed = default_main_program().random_seed
......
...@@ -27,7 +27,7 @@ class TestTracerMode(unittest.TestCase): ...@@ -27,7 +27,7 @@ class TestTracerMode(unittest.TestCase):
self.init_mode = True self.init_mode = True
def get_tracer_mode(self): def get_tracer_mode(self):
assert fluid._non_static_mode(), "Dygraph mode must be enabled" assert framework.in_dygraph_mode(), "Dygraph mode must be enabled"
@fluid.dygraph.no_grad @fluid.dygraph.no_grad
def no_grad_func(self, a): def no_grad_func(self, a):
......
...@@ -26,7 +26,7 @@ from paddle.vision.models import resnet50, resnet101 ...@@ -26,7 +26,7 @@ from paddle.vision.models import resnet50, resnet101
def _dygraph_guard_(func): def _dygraph_guard_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
return func(*args, **kwargs) return func(*args, **kwargs)
else: else:
with fluid.dygraph.guard(): with fluid.dygraph.guard():
......
...@@ -206,7 +206,7 @@ class EncoderNet(paddle.nn.Layer): ...@@ -206,7 +206,7 @@ class EncoderNet(paddle.nn.Layer):
initializer=paddle.nn.initializer.Normal(0.0, 0.02), initializer=paddle.nn.initializer.Normal(0.0, 0.02),
learning_rate=2.0, learning_rate=2.0,
) )
if fluid.framework._non_static_mode(): if fluid.framework.in_dygraph_mode():
h_0 = np.zeros( h_0 = np.zeros(
(Config.batch_size, rnn_hidden_size), dtype="float32" (Config.batch_size, rnn_hidden_size), dtype="float32"
) )
......
...@@ -58,7 +58,7 @@ def optimizer_setting(params, parameter_list=None): ...@@ -58,7 +58,7 @@ def optimizer_setting(params, parameter_list=None):
base_lr = params["lr"] base_lr = params["lr"]
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD( optimizer = fluid.optimizer.SGD(
learning_rate=0.01, parameter_list=parameter_list learning_rate=0.01, parameter_list=parameter_list
) )
......
...@@ -54,7 +54,7 @@ def optimizer_setting(params, parameter_list=None): ...@@ -54,7 +54,7 @@ def optimizer_setting(params, parameter_list=None):
base_lr = params["lr"] base_lr = params["lr"]
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD( optimizer = fluid.optimizer.SGD(
learning_rate=0.01, parameter_list=parameter_list learning_rate=0.01, parameter_list=parameter_list
) )
......
...@@ -54,7 +54,7 @@ def optimizer_setting(params, parameter_list=None): ...@@ -54,7 +54,7 @@ def optimizer_setting(params, parameter_list=None):
# bd = [step * e for e in ls["epochs"]] # bd = [step * e for e in ls["epochs"]]
# base_lr = params["lr"] # base_lr = params["lr"]
# lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] # lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD( optimizer = fluid.optimizer.SGD(
learning_rate=0.01, parameter_list=parameter_list learning_rate=0.01, parameter_list=parameter_list
) )
......
...@@ -114,7 +114,7 @@ class InstanceNorm(paddle.nn.Layer): ...@@ -114,7 +114,7 @@ class InstanceNorm(paddle.nn.Layer):
self.bias = self.create_parameter(shape=[num_channels], is_bias=True) self.bias = self.create_parameter(shape=[num_channels], is_bias=True)
def forward(self, input): def forward(self, input):
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
out, _, _ = _legacy_C_ops.instance_norm( out, _, _ = _legacy_C_ops.instance_norm(
input, self.scale, self.bias, 'epsilon', self.epsilon input, self.scale, self.bias, 'epsilon', self.epsilon
) )
...@@ -387,7 +387,7 @@ def loss_cls(cls, label, cfg): ...@@ -387,7 +387,7 @@ def loss_cls(cls, label, cfg):
def calc_gradients(outputs, inputs, no_grad_set): def calc_gradients(outputs, inputs, no_grad_set):
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
return fluid.dygraph.grad( return fluid.dygraph.grad(
outputs=outputs, outputs=outputs,
inputs=inputs, inputs=inputs,
...@@ -481,7 +481,7 @@ def build_optimizer(layer, cfg, loss=None): ...@@ -481,7 +481,7 @@ def build_optimizer(layer, cfg, loss=None):
learning_rate = 1e-3 learning_rate = 1e-3
beta1 = 0.5 beta1 = 0.5
beta2 = 0.999 beta2 = 0.999
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
return fluid.optimizer.Adam( return fluid.optimizer.Adam(
learning_rate=learning_rate, learning_rate=learning_rate,
beta1=beta1, beta1=beta1,
......
...@@ -24,7 +24,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator ...@@ -24,7 +24,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator
def _dygraph_guard_(func): def _dygraph_guard_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
return func(*args, **kwargs) return func(*args, **kwargs)
else: else:
with fluid.dygraph.guard(): with fluid.dygraph.guard():
......
...@@ -24,7 +24,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator ...@@ -24,7 +24,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator
def _dygraph_guard_(func): def _dygraph_guard_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
if fluid._non_static_mode(): if fluid.in_dygraph_mode():
return func(*args, **kwargs) return func(*args, **kwargs)
else: else:
with fluid.dygraph.guard(): with fluid.dygraph.guard():
......
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
from eager_op_test import convert_float_to_uint16 from eager_op_test import convert_float_to_uint16
import paddle import paddle
from paddle.framework import _non_static_mode from paddle.framework import in_dynamic_mode
from paddle.static import Executor, Program, program_guard from paddle.static import Executor, Program, program_guard
SUPPORTED_DTYPES = [ SUPPORTED_DTYPES = [
...@@ -182,11 +182,11 @@ def test_type_error(unit_test, use_gpu, type_str_map): ...@@ -182,11 +182,11 @@ def test_type_error(unit_test, use_gpu, type_str_map):
if binary_op: if binary_op:
if type_str_map['x'] != type_str_map['y']: if type_str_map['x'] != type_str_map['y']:
unit_test.assertRaises(error_type, op, x=x, y=y) unit_test.assertRaises(error_type, op, x=x, y=y)
if not _non_static_mode(): if not in_dynamic_mode():
error_type = TypeError error_type = TypeError
unit_test.assertRaises(error_type, op, x=x, y=y, out=1) unit_test.assertRaises(error_type, op, x=x, y=y, out=1)
else: else:
if not _non_static_mode(): if not in_dynamic_mode():
error_type = TypeError error_type = TypeError
unit_test.assertRaises(error_type, op, x=x, out=1) unit_test.assertRaises(error_type, op, x=x, out=1)
......
...@@ -19,8 +19,8 @@ import numpy as np ...@@ -19,8 +19,8 @@ import numpy as np
from eager_op_test import OpTest from eager_op_test import OpTest
import paddle import paddle
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops
from paddle.fluid import _non_static_mode, core, in_dygraph_mode from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
...@@ -42,7 +42,7 @@ def multiclass_nms3( ...@@ -42,7 +42,7 @@ def multiclass_nms3(
helper = LayerHelper('multiclass_nms3', **locals()) helper = LayerHelper('multiclass_nms3', **locals())
if in_dygraph_mode(): if paddle.in_dynamic_mode():
attrs = ( attrs = (
score_threshold, score_threshold,
nms_top_k, nms_top_k,
...@@ -58,30 +58,6 @@ def multiclass_nms3( ...@@ -58,30 +58,6 @@ def multiclass_nms3(
if not return_index: if not return_index:
index = None index = None
return output, index, nms_rois_num return output, index, nms_rois_num
elif _non_static_mode():
attrs = (
'background_label',
background_label,
'score_threshold',
score_threshold,
'nms_top_k',
nms_top_k,
'nms_threshold',
nms_threshold,
'keep_top_k',
keep_top_k,
'nms_eta',
nms_eta,
'normalized',
normalized,
)
output, index, nms_rois_num = _legacy_C_ops.multiclass_nms3(
bboxes, scores, rois_num, *attrs
)
if not return_index:
index = None
return output, index, nms_rois_num
else: else:
output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
index = helper.create_variable_for_type_inference(dtype='int32') index = helper.create_variable_for_type_inference(dtype='int32')
......
...@@ -20,11 +20,11 @@ from numpy import linalg as LA ...@@ -20,11 +20,11 @@ from numpy import linalg as LA
import paddle import paddle
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from paddle.framework import in_dygraph_mode from paddle.framework import in_dynamic_mode
def test_squared_l2_norm(x): def test_squared_l2_norm(x):
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.squared_l2_norm(x) return _C_ops.squared_l2_norm(x)
else: else:
return _legacy_C_ops.squared_l2_norm(x) return _legacy_C_ops.squared_l2_norm(x)
......
...@@ -120,9 +120,9 @@ def generate(key): ...@@ -120,9 +120,9 @@ def generate(key):
# NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode, # NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode,
# in order to keep name consistency. # in order to keep name consistency.
def generate_with_ignorable_key(key): def generate_with_ignorable_key(key):
from .framework import _non_static_mode, _dygraph_tracer from .framework import in_dygraph_mode, _dygraph_tracer
if _non_static_mode(): if in_dygraph_mode():
return _dygraph_tracer()._generate_unique_name() return _dygraph_tracer()._generate_unique_name()
return generator(key) return generator(key)
......
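Per the NOTE in this hunk, dygraph names come from the C++ tracer so that naming stays consistent with the eager path; outside dygraph the Python-side generator keyed by `key` is used. A sketch of that behaviour, using the public `unique_name.generate` in place of the module-internal generator:

    from paddle.fluid import unique_name
    from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode

    def generate_with_ignorable_key_sketch(key):
        if in_dygraph_mode():
            # dygraph: the C++ tracer owns the counter
            return _dygraph_tracer()._generate_unique_name()
        # static graph: Python-side generator, name derived from `key`
        return unique_name.generate(key)

    print(generate_with_ignorable_key_sketch("tmp"))
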
...@@ -436,8 +436,7 @@ def _getitem_impl_(var, item): ...@@ -436,8 +436,7 @@ def _getitem_impl_(var, item):
start = 0 if step > 0 else MAX_INTEGER start = 0 if step > 0 else MAX_INTEGER
if end is None: if end is None:
if ( if (
paddle.fluid.framework._non_static_mode() paddle.in_dynamic_mode() or not is_tensor_array
or not is_tensor_array
) and var.shape[dim] != -1: ) and var.shape[dim] != -1:
end = var.shape[dim] if step > 0 else -1 end = var.shape[dim] if step > 0 else -1
else: else:
...@@ -550,7 +549,7 @@ def _getitem_impl_(var, item): ...@@ -550,7 +549,7 @@ def _getitem_impl_(var, item):
out = var out = var
if len(axes) > 0: if len(axes) > 0:
op_type = "strided_slice" if use_strided_slice else "slice" op_type = "strided_slice" if use_strided_slice else "slice"
if paddle.fluid.framework.in_dygraph_mode() and op_type == "slice": if paddle.in_dynamic_mode() and op_type == "slice":
if "StartsTensorList" in inputs.keys(): if "StartsTensorList" in inputs.keys():
st = inputs['StartsTensorList'] st = inputs['StartsTensorList']
else: else:
...@@ -620,11 +619,11 @@ def _setitem_for_tensor_array(var, item, value): ...@@ -620,11 +619,11 @@ def _setitem_for_tensor_array(var, item, value):
If item is case (1), we perform paddle.tensor.array_write, If item is case (1), we perform paddle.tensor.array_write,
in other cases, we raise a NotImplementedError. in other cases, we raise a NotImplementedError.
""" """
from ..framework import LayerHelper, core, _non_static_mode from ..framework import LayerHelper, core
from .framework import Variable from .framework import Variable
assert ( assert (
not _non_static_mode() not paddle.in_dynamic_mode()
), "setitem for tensor_array must be called in static graph mode." ), "setitem for tensor_array must be called in static graph mode."
if isinstance(item, (Variable, int)): if isinstance(item, (Variable, int)):
from paddle.jit.dy2static.variable_trans_func import ( from paddle.jit.dy2static.variable_trans_func import (
...@@ -808,7 +807,7 @@ def _setitem_impl_(var, item, value): ...@@ -808,7 +807,7 @@ def _setitem_impl_(var, item, value):
) )
) )
if paddle.fluid.framework._non_static_mode(): if paddle.in_dynamic_mode():
var._bump_inplace_version() var._bump_inplace_version()
cur_block = default_main_program().current_block() cur_block = default_main_program().current_block()
......
...@@ -55,10 +55,7 @@ from ..fluid.framework import set_flags # noqa: F401 ...@@ -55,10 +55,7 @@ from ..fluid.framework import set_flags # noqa: F401
from ..fluid.framework import Parameter from ..fluid.framework import Parameter
from ..fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 from ..fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401
from ..fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401 from ..fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401
from ..fluid.framework import _non_static_mode as in_dynamic_mode # noqa: F401 from ..fluid.framework import in_dygraph_mode as in_dynamic_mode # noqa: F401
from ..fluid.framework import ( # noqa: F401
_non_static_mode, # temporary used for hackson
)
from ..fluid.framework import ( from ..fluid.framework import (
_current_expected_place, _current_expected_place,
_get_paddle_place, _get_paddle_place,
...@@ -74,7 +71,6 @@ from ..fluid.framework import _dygraph_tracer # noqa: F401 ...@@ -74,7 +71,6 @@ from ..fluid.framework import _dygraph_tracer # noqa: F401
from ..fluid.framework import generate_control_dev_var_name # noqa: F401 from ..fluid.framework import generate_control_dev_var_name # noqa: F401
from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.layer_helper import LayerHelper # noqa: F401
from ..fluid.framework import in_dygraph_mode # noqa: F401
from ..fluid.framework import _global_flags # noqa: F401 from ..fluid.framework import _global_flags # noqa: F401
from ..fluid.framework import _apply_pass # noqa: F401 from ..fluid.framework import _apply_pass # noqa: F401
from ..fluid.framework import switch_main_program from ..fluid.framework import switch_main_program
......
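This hunk is the pivot of the whole change set: `in_dynamic_mode` (re-exported at the `paddle` top level) stops aliasing the removed `_non_static_mode` and now aliases `fluid.framework.in_dygraph_mode`, so both spellings answer the same tracer check. A quick way to see the behavioural equivalence from a REPL (the functions are expected to be the same object, but the asserts below rely only on their return values):

    import paddle
    from paddle.fluid import framework

    # dynamic mode is the 2.x default
    assert paddle.in_dynamic_mode() and framework.in_dygraph_mode()

    paddle.enable_static()
    assert not paddle.in_dynamic_mode() and not framework.in_dygraph_mode()

    paddle.disable_static()  # leave things as we found them
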
...@@ -34,7 +34,7 @@ from paddle.fluid.framework import ( ...@@ -34,7 +34,7 @@ from paddle.fluid.framework import (
_create_tensor, _create_tensor,
_current_expected_place, _current_expected_place,
_dygraph_tracer, _dygraph_tracer,
_non_static_mode, in_dygraph_mode,
) )
from .io_utils import ( from .io_utils import (
...@@ -438,7 +438,7 @@ def _to_LodTensor(ndarray): ...@@ -438,7 +438,7 @@ def _to_LodTensor(ndarray):
def _tuple_to_tensor(obj, return_numpy): def _tuple_to_tensor(obj, return_numpy):
if return_numpy: if return_numpy:
return obj[1] return obj[1]
if _non_static_mode(): if in_dygraph_mode():
t = paddle.to_tensor(obj[1]) t = paddle.to_tensor(obj[1])
# This function does modify the name of return value. # This function does modify the name of return value.
# Loading the same variable multiple times may cause the same name. # Loading the same variable multiple times may cause the same name.
...@@ -451,7 +451,7 @@ def _tuple_to_tensor(obj, return_numpy): ...@@ -451,7 +451,7 @@ def _tuple_to_tensor(obj, return_numpy):
def _ndarray_to_tensor(obj, return_numpy): def _ndarray_to_tensor(obj, return_numpy):
if return_numpy: if return_numpy:
return obj return obj
if _non_static_mode(): if in_dygraph_mode():
return paddle.to_tensor(obj) return paddle.to_tensor(obj)
else: else:
return _to_LodTensor(obj) return _to_LodTensor(obj)
...@@ -508,7 +508,7 @@ def _parse_load_result(obj, return_numpy): ...@@ -508,7 +508,7 @@ def _parse_load_result(obj, return_numpy):
return obj return obj
if _contain_x(obj, is_layer): if _contain_x(obj, is_layer):
if not _non_static_mode(): if not in_dygraph_mode():
raise ValueError( raise ValueError(
"Layer can only be loaded in dynamic graph mode, but now in static graph mode." "Layer can only be loaded in dynamic graph mode, but now in static graph mode."
) )
...@@ -819,7 +819,7 @@ def save(obj, path, protocol=4, **configs): ...@@ -819,7 +819,7 @@ def save(obj, path, protocol=4, **configs):
f.write(obj.desc.serialize_to_string()) f.write(obj.desc.serialize_to_string())
elif _is_state_dict(obj): elif _is_state_dict(obj):
if _non_static_mode(): if in_dygraph_mode():
_legacy_save(obj, path, protocol) _legacy_save(obj, path, protocol)
else: else:
_legacy_static_save(obj, path, protocol) _legacy_static_save(obj, path, protocol)
...@@ -1110,7 +1110,7 @@ def load(path, **configs): ...@@ -1110,7 +1110,7 @@ def load(path, **configs):
if config.return_numpy: if config.return_numpy:
return np.array(tensor) return np.array(tensor)
else: else:
if _non_static_mode(): if in_dygraph_mode():
return _lod_tensor2varbase(tensor) return _lod_tensor2varbase(tensor)
return tensor return tensor
except: except:
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
from paddle import _C_ops from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
__all__ = [] __all__ = []
...@@ -50,7 +50,7 @@ def segment_sum(data, segment_ids, name=None): ...@@ -50,7 +50,7 @@ def segment_sum(data, segment_ids, name=None):
#Outputs: [[4., 4., 4.], [4., 5., 6.]] #Outputs: [[4., 4., 4.], [4., 5., 6.]]
""" """
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.segment_pool(data, segment_ids, "SUM") return _C_ops.segment_pool(data, segment_ids, "SUM")
else: else:
check_variable_and_dtype( check_variable_and_dtype(
...@@ -107,7 +107,7 @@ def segment_mean(data, segment_ids, name=None): ...@@ -107,7 +107,7 @@ def segment_mean(data, segment_ids, name=None):
""" """
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.segment_pool(data, segment_ids, "MEAN") return _C_ops.segment_pool(data, segment_ids, "MEAN")
else: else:
...@@ -164,7 +164,7 @@ def segment_min(data, segment_ids, name=None): ...@@ -164,7 +164,7 @@ def segment_min(data, segment_ids, name=None):
""" """
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.segment_pool(data, segment_ids, "MIN") return _C_ops.segment_pool(data, segment_ids, "MIN")
else: else:
check_variable_and_dtype( check_variable_and_dtype(
...@@ -220,7 +220,7 @@ def segment_max(data, segment_ids, name=None): ...@@ -220,7 +220,7 @@ def segment_max(data, segment_ids, name=None):
""" """
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.segment_pool(data, segment_ids, "MAX") return _C_ops.segment_pool(data, segment_ids, "MAX")
else: else:
check_variable_and_dtype( check_variable_and_dtype(
......
...@@ -20,8 +20,9 @@ from paddle.fluid.data_feeder import ( ...@@ -20,8 +20,9 @@ from paddle.fluid.data_feeder import (
check_type, check_type,
check_variable_and_dtype, check_variable_and_dtype,
) )
from paddle.fluid.framework import Variable, in_dygraph_mode from paddle.fluid.framework import Variable
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
from .utils import ( from .utils import (
convert_out_size_to_list, convert_out_size_to_list,
...@@ -118,7 +119,7 @@ def send_u_recv( ...@@ -118,7 +119,7 @@ def send_u_recv(
# TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1.
if in_dygraph_mode(): if in_dynamic_mode():
out_size = convert_out_size_to_list(out_size) out_size = convert_out_size_to_list(out_size)
return _C_ops.send_u_recv( return _C_ops.send_u_recv(
x, src_index, dst_index, reduce_op.upper(), out_size x, src_index, dst_index, reduce_op.upper(), out_size
...@@ -295,7 +296,7 @@ def send_ue_recv( ...@@ -295,7 +296,7 @@ def send_ue_recv(
# TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1.
if in_dygraph_mode(): if in_dynamic_mode():
out_size = convert_out_size_to_list(out_size) out_size = convert_out_size_to_list(out_size)
return _C_ops.send_ue_recv( return _C_ops.send_ue_recv(
x, x,
...@@ -451,7 +452,7 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): ...@@ -451,7 +452,7 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None):
message_op = 'mul' message_op = 'mul'
y = 1.0 / (y + 1e-12) y = 1.0 / (y + 1e-12)
if in_dygraph_mode(): if in_dynamic_mode():
return _C_ops.send_uv(x, y, src_index, dst_index, message_op.upper()) return _C_ops.send_uv(x, y, src_index, dst_index, message_op.upper())
else: else:
......
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
import paddle import paddle
from paddle import _C_ops from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import Variable, _non_static_mode from paddle.fluid.framework import Variable
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
__all__ = [] __all__ = []
...@@ -86,7 +87,7 @@ def reindex_graph( ...@@ -86,7 +87,7 @@ def reindex_graph(
True if value_buffer is not None and index_buffer is not None else False True if value_buffer is not None and index_buffer is not None else False
) )
if _non_static_mode(): if in_dynamic_mode():
reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph( reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph(
x, x,
neighbors, neighbors,
...@@ -205,7 +206,7 @@ def reindex_heter_graph( ...@@ -205,7 +206,7 @@ def reindex_heter_graph(
True if value_buffer is not None and index_buffer is not None else False True if value_buffer is not None and index_buffer is not None else False
) )
if _non_static_mode(): if in_dynamic_mode():
neighbors = paddle.concat(neighbors, axis=0) neighbors = paddle.concat(neighbors, axis=0)
count = paddle.concat(count, axis=0) count = paddle.concat(count, axis=0)
reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph( reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph(
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
__all__ = [] __all__ = []
...@@ -100,7 +100,7 @@ def sample_neighbors( ...@@ -100,7 +100,7 @@ def sample_neighbors(
use_perm_buffer = True if perm_buffer is not None else False use_perm_buffer = True if perm_buffer is not None else False
if _non_static_mode(): if in_dynamic_mode():
( (
out_neighbors, out_neighbors,
out_count, out_count,
...@@ -251,7 +251,7 @@ def weighted_sample_neighbors( ...@@ -251,7 +251,7 @@ def weighted_sample_neighbors(
"`eids` should not be None if `return_eids` is True." "`eids` should not be None if `return_eids` is True."
) )
if in_dygraph_mode(): if in_dynamic_mode():
( (
out_neighbors, out_neighbors,
out_count, out_count,
......
...@@ -33,7 +33,8 @@ from paddle.fluid.dygraph.base import to_variable ...@@ -33,7 +33,8 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.executor import global_scope from paddle.fluid.executor import global_scope
from paddle.fluid.framework import Variable from paddle.fluid.framework import Variable
from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.framework import _get_paddle_place, _non_static_mode from paddle.fluid.framework import _get_paddle_place
from paddle.framework import in_dynamic_mode
from paddle.framework.io_utils import is_belong_to_optimizer from paddle.framework.io_utils import is_belong_to_optimizer
from paddle.io import DataLoader, Dataset, DistributedBatchSampler from paddle.io import DataLoader, Dataset, DistributedBatchSampler
from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
...@@ -256,7 +257,7 @@ def prepare_distributed_context(place=None): ...@@ -256,7 +257,7 @@ def prepare_distributed_context(place=None):
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(communicator_prog) exe.run(communicator_prog)
if fluid._non_static_mode(): if in_dynamic_mode():
fluid.disable_dygraph() fluid.disable_dygraph()
_init_context() _init_context()
fluid.enable_dygraph(place) fluid.enable_dygraph(place)
...@@ -1170,7 +1171,7 @@ class Model: ...@@ -1170,7 +1171,7 @@ class Model:
self._test_dataloader = None self._test_dataloader = None
self.stop_training = False self.stop_training = False
if not _non_static_mode(): if not in_dynamic_mode():
if not isinstance(inputs, (list, tuple, dict, Input)): if not isinstance(inputs, (list, tuple, dict, Input)):
raise TypeError( raise TypeError(
"'inputs' must be list or tuple or dict, and couldn't be None." "'inputs' must be list or tuple or dict, and couldn't be None."
...@@ -1182,7 +1183,7 @@ class Model: ...@@ -1182,7 +1183,7 @@ class Model:
self._labels = self._verify_spec(labels) self._labels = self._verify_spec(labels)
# init backend # init backend
if fluid._non_static_mode(): if in_dynamic_mode():
self._adapter = DynamicGraphAdapter(self) self._adapter = DynamicGraphAdapter(self)
else: else:
self._adapter = StaticGraphAdapter(self) self._adapter = StaticGraphAdapter(self)
...@@ -1238,7 +1239,7 @@ class Model: ...@@ -1238,7 +1239,7 @@ class Model:
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if in_dynamic_mode() and self._input_info is None:
self._update_inputs() self._update_inputs()
return loss return loss
...@@ -1292,7 +1293,7 @@ class Model: ...@@ -1292,7 +1293,7 @@ class Model:
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if in_dynamic_mode() and self._input_info is None:
self._update_inputs() self._update_inputs()
return loss return loss
...@@ -1341,7 +1342,7 @@ class Model: ...@@ -1341,7 +1342,7 @@ class Model:
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if in_dynamic_mode() and self._input_info is None:
self._update_inputs() self._update_inputs()
return loss return loss
...@@ -1527,7 +1528,7 @@ class Model: ...@@ -1527,7 +1528,7 @@ class Model:
) )
# TODO: support save/load scaler state in static graph # TODO: support save/load scaler state in static graph
if _non_static_mode(): if in_dynamic_mode():
scaler_state = None scaler_state = None
if hasattr(self, '_scaler') and self._scaler is not None: if hasattr(self, '_scaler') and self._scaler is not None:
if os.path.exists(path + '.pdscaler'): if os.path.exists(path + '.pdscaler'):
...@@ -1644,7 +1645,7 @@ class Model: ...@@ -1644,7 +1645,7 @@ class Model:
) )
if 'use_fp16_guard' in amp_config_key_set: if 'use_fp16_guard' in amp_config_key_set:
if _non_static_mode(): if in_dynamic_mode():
raise ValueError( raise ValueError(
"'use_fp16_guard' is supported in static graph mode only." "'use_fp16_guard' is supported in static graph mode only."
) )
...@@ -1702,7 +1703,7 @@ class Model: ...@@ -1702,7 +1703,7 @@ class Model:
paddle.distributed.ParallelEnv().nranks > 1 paddle.distributed.ParallelEnv().nranks > 1
and not _parallel_context_initialized and not _parallel_context_initialized
): ):
if fluid._non_static_mode(): if in_dynamic_mode():
main_prog_seed = fluid.default_main_program().random_seed main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = ( startup_prog_seed = (
fluid.default_startup_program().random_seed fluid.default_startup_program().random_seed
...@@ -2228,7 +2229,7 @@ class Model: ...@@ -2228,7 +2229,7 @@ class Model:
None None
""" """
if fluid._non_static_mode(): if in_dynamic_mode():
with fluid.framework._dygraph_guard(None): with fluid.framework._dygraph_guard(None):
layer = self.network layer = self.network
if self._input_info is None: # No provided or inferred if self._input_info is None: # No provided or inferred
...@@ -2428,7 +2429,7 @@ class Model: ...@@ -2428,7 +2429,7 @@ class Model:
if ( if (
shapes is not None shapes is not None
and dtypes is not None and dtypes is not None
and fluid._non_static_mode() and in_dynamic_mode()
): ):
out_specs = [ out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i]) Input(name=n, dtype=dtypes[i], shape=shapes[i])
......
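To see the Model branches above from the user side, here is a hedged sketch of ordinary paddle.Model usage (not code from this PR): in the default eager mode in_dynamic_mode() is True, so Model picks the dynamic-graph adapter and the InputSpec is optional, whereas under paddle.enable_static() the constructor requires it and builds the static adapter instead.

import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(4, 2)
model = paddle.Model(net, inputs=[InputSpec([None, 4], 'float32', 'x')])
model.prepare(
    optimizer=paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()),
    loss=paddle.nn.MSELoss(),
)

x = paddle.rand([8, 4])
y = paddle.rand([8, 2])
# train_batch() is one of the call sites shown above: on the first eager-mode
# step it also records the concrete input shapes (the `_input_info is None` branch).
print(model.train_batch([x], [y]))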
...@@ -69,7 +69,7 @@ def vjp(func, xs, v=None): ...@@ -69,7 +69,7 @@ def vjp(func, xs, v=None):
# ``_seprate`` breaks the dependencies between ``xs`` and other # ``_seprate`` breaks the dependencies between ``xs`` and other
# variables. See more ``_seprate`` . # variables. See more ``_seprate`` .
if paddle.fluid._non_static_mode() or not utils.prim_enabled(): if framework.in_dygraph_mode() or not utils.prim_enabled():
xs, v = _separate(xs), _separate(v) xs, v = _separate(xs), _separate(v)
ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs)
_check_v_shape(v, ys) _check_v_shape(v, ys)
...@@ -130,12 +130,12 @@ def jvp(func, xs, v=None): ...@@ -130,12 +130,12 @@ def jvp(func, xs, v=None):
_check_inputs(func, xs, v) _check_inputs(func, xs, v)
# ``_seprate`` breaks the dependencies between ``xs`` and other # ``_seprate`` breaks the dependencies between ``xs`` and other
# variables. See more ``_seprate`` . # variables. See more ``_seprate`` .
if paddle.fluid._non_static_mode() or not utils.prim_enabled(): if framework.in_dygraph_mode() or not utils.prim_enabled():
xs, v = _separate(xs), _separate(v) xs, v = _separate(xs), _separate(v)
ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs)
_check_v_shape(v, xs) _check_v_shape(v, xs)
if not paddle.fluid._non_static_mode() and utils.prim_enabled(): if not framework.in_dygraph_mode() and utils.prim_enabled():
return ys, primapi.forward_grad(ys, xs, v) return ys, primapi.forward_grad(ys, xs, v)
else: else:
return ys, _double_backward_trick(ys, xs, v) return ys, _double_backward_trick(ys, xs, v)
...@@ -352,7 +352,7 @@ class _Jacobian: ...@@ -352,7 +352,7 @@ class _Jacobian:
def __init__(self, func, xs): def __init__(self, func, xs):
# Skip separating in prim mode temporarily, as detach and clone are not # Skip separating in prim mode temporarily, as detach and clone are not
# primitive operators. # primitive operators.
if not paddle.fluid._non_static_mode() and utils.prim_enabled(): if not framework.in_dygraph_mode() and utils.prim_enabled():
self._xs = xs self._xs = xs
else: else:
self._xs = _separate(xs) self._xs = _separate(xs)
...@@ -580,7 +580,7 @@ def _grad(ys, xs, v=None): ...@@ -580,7 +580,7 @@ def _grad(ys, xs, v=None):
Tensor is the sum of gradients of outputs with respect to the i-th Tensor is the sum of gradients of outputs with respect to the i-th
inputs. inputs.
""" """
if paddle.fluid._non_static_mode(): if framework.in_dygraph_mode():
# paddle.grad returns a list though the inputs is a signle Tensor. The # paddle.grad returns a list though the inputs is a signle Tensor. The
# follow code snippet fixes the problem by return the first element of # follow code snippet fixes the problem by return the first element of
# xs_grad when the xs is a signle Tensor. # xs_grad when the xs is a signle Tensor.
......
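The comment kept in this hunk points out that paddle.grad always returns a list even when a single input Tensor is passed, which is why the dynamic-mode branch of _grad unwraps the first element. A small standalone illustration (ordinary Paddle API, not part of the PR):

import paddle

x = paddle.to_tensor(3.0, stop_gradient=False)
y = x * x
grads = paddle.grad([y], [x])  # a list, even though there is only one input
print(type(grads), grads[0])   # <class 'list'>, gradient value 6.0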
...@@ -26,7 +26,7 @@ from paddle import nn ...@@ -26,7 +26,7 @@ from paddle import nn
from paddle.autograd import PyLayer from paddle.autograd import PyLayer
from paddle.distributed.utils.moe_utils import global_gather, global_scatter from paddle.distributed.utils.moe_utils import global_gather, global_scatter
from paddle.distributed.utils.nccl_utils import check_nccl_version_for_p2p from paddle.distributed.utils.nccl_utils import check_nccl_version_for_p2p
from paddle.framework import in_dygraph_mode from paddle.framework import in_dynamic_mode
from paddle.incubate.distributed.fleet import recompute_hybrid from paddle.incubate.distributed.fleet import recompute_hybrid
from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate
...@@ -63,7 +63,7 @@ def _all_gather(tensor, group=None, use_calc_stream=True): ...@@ -63,7 +63,7 @@ def _all_gather(tensor, group=None, use_calc_stream=True):
if group is not None and not group.is_member(): if group is not None and not group.is_member():
return return
if in_dygraph_mode(): if in_dynamic_mode():
group = ( group = (
paddle.distributed.collective._get_default_group() paddle.distributed.collective._get_default_group()
if group is None if group is None
......
...@@ -26,14 +26,14 @@ from paddle.distributed.models.moe.utils import ( ...@@ -26,14 +26,14 @@ from paddle.distributed.models.moe.utils import (
_number_count, _number_count,
_prune_gate_by_capacity, _prune_gate_by_capacity,
) )
from paddle.framework import in_dygraph_mode from paddle.framework import in_dynamic_mode
def _alltoall(in_tensor_list, group=None, use_calc_stream=True): def _alltoall(in_tensor_list, group=None, use_calc_stream=True):
if group is not None and not group.is_member(): if group is not None and not group.is_member():
return return
if in_dygraph_mode(): if in_dynamic_mode():
group = ( group = (
paddle.distributed.collective._get_default_group() paddle.distributed.collective._get_default_group()
if group is None if group is None
......
...@@ -1174,7 +1174,7 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): ...@@ -1174,7 +1174,7 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True) output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True)
""" """
if paddle.fluid._non_static_mode(): if paddle.in_dynamic_mode():
attrs = ('has_offset', has_offset) attrs = ('has_offset', has_offset)
return _legacy_C_ops.bilateral_slice(x, grid, guide, *attrs) return _legacy_C_ops.bilateral_slice(x, grid, guide, *attrs)
...@@ -1252,7 +1252,7 @@ def correlation( ...@@ -1252,7 +1252,7 @@ def correlation(
""" """
if paddle.fluid._non_static_mode(): if paddle.in_dynamic_mode():
attrs = ( attrs = (
"pad_size", "pad_size",
pad_size, pad_size,
...@@ -1501,7 +1501,7 @@ def fused_bn_add_act( ...@@ -1501,7 +1501,7 @@ def fused_bn_add_act(
def pow2_decay_with_linear_warmup( def pow2_decay_with_linear_warmup(
warmup_steps, total_steps, base_lr, end_lr, dtype='float32', name=None warmup_steps, total_steps, base_lr, end_lr, dtype='float32', name=None
): ):
if paddle.fluid._non_static_mode(): if paddle.in_dynamic_mode():
raise NotImplementedError( raise NotImplementedError(
"pow2_decay_with_linear_warmup does not support dygraph mode yet." "pow2_decay_with_linear_warmup does not support dygraph mode yet."
) )
......
...@@ -16,8 +16,7 @@ ...@@ -16,8 +16,7 @@
from paddle import _C_ops from paddle import _C_ops
from paddle.common_ops_import import default_main_program from paddle.common_ops_import import default_main_program
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.framework import in_dygraph_mode from paddle.framework import LayerHelper, in_dynamic_mode
from paddle.framework import LayerHelper
def fused_dropout_add( def fused_dropout_add(
...@@ -73,7 +72,7 @@ def fused_dropout_add( ...@@ -73,7 +72,7 @@ def fused_dropout_add(
"mode argument should be 'downscale_in_infer' or 'upscale_in_train'" "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
) )
seed = None seed = None
if in_dygraph_mode(): if in_dynamic_mode():
if default_main_program().random_seed != 0: if default_main_program().random_seed != 0:
seed = default_main_program().random_seed seed = default_main_program().random_seed
out, seed_offset = _C_ops.fused_dropout_add( out, seed_offset = _C_ops.fused_dropout_add(
......
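Several fused-op call sites in this PR (fused_dropout_add here, and fused_feedforward / fused_multi_head_attention further below) only forward an explicit seed in dynamic mode when the global program seed is non-zero. A short sketch of that knob, using only the standard default_main_program API (nothing PR-specific):

import paddle

prog = paddle.static.default_main_program()
print(prog.random_seed)   # 0 by default, so the fused ops generate their own seed
prog.random_seed = 2023   # once non-zero, the in_dynamic_mode() branches pass it on
print(prog.random_seed)
prog.random_seed = 0      # reset to the default behaviour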
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.framework import _non_static_mode from paddle.framework import in_dynamic_mode
def fused_gate_attention( def fused_gate_attention(
...@@ -142,7 +142,7 @@ def fused_gate_attention( ...@@ -142,7 +142,7 @@ def fused_gate_attention(
# [2, 4, 2, 4] # [2, 4, 2, 4]
""" """
if _non_static_mode(): if in_dynamic_mode():
_, _, _, _, _, _, _, _, out = _legacy_C_ops.fused_gate_attention( _, _, _, _, _, _, _, _, out = _legacy_C_ops.fused_gate_attention(
query, query,
key, key,
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
# limitations under the License. # limitations under the License.
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
from paddle.tensor.linalg import matmul from paddle.tensor.linalg import matmul
...@@ -53,7 +53,7 @@ def fused_matmul_bias( ...@@ -53,7 +53,7 @@ def fused_matmul_bias(
""" """
if bias is None: if bias is None:
return matmul(x, y, transpose_x, transpose_y, name) return matmul(x, y, transpose_x, transpose_y, name)
if _non_static_mode(): if in_dynamic_mode():
return _legacy_C_ops.fused_gemm_epilogue( return _legacy_C_ops.fused_gemm_epilogue(
x, y, bias, 'trans_x', transpose_x, 'trans_y', transpose_y x, y, bias, 'trans_x', transpose_x, 'trans_y', transpose_y
) )
......
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode, default_main_program from paddle.fluid.framework import default_main_program
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
__all__ = [] __all__ = []
...@@ -132,7 +133,7 @@ def fused_feedforward( ...@@ -132,7 +133,7 @@ def fused_feedforward(
'downgrade_in_infer' if mode == 'downscale_in_infer' else mode 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
) # semantic transfer ) # semantic transfer
if _non_static_mode(): if in_dynamic_mode():
if default_main_program().random_seed != 0: if default_main_program().random_seed != 0:
seed = default_main_program().random_seed seed = default_main_program().random_seed
out, _, _, _, _, _, _, _, _, _, _ = _legacy_C_ops.fused_feedforward( out, _, _, _, _, _, _, _, _, _, _ = _legacy_C_ops.fused_feedforward(
...@@ -363,7 +364,7 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -363,7 +364,7 @@ def fused_bias_dropout_residual_layer_norm(
x.shape[len(x.shape) - 1] == ln_bias.shape[0] x.shape[len(x.shape) - 1] == ln_bias.shape[0]
), "The dim of ln_bias must equal to the last dim of x." ), "The dim of ln_bias must equal to the last dim of x."
if _non_static_mode(): if in_dynamic_mode():
if default_main_program().random_seed != 0: if default_main_program().random_seed != 0:
seed = default_main_program().random_seed seed = default_main_program().random_seed
( (
...@@ -620,7 +621,7 @@ def fused_multi_head_attention( ...@@ -620,7 +621,7 @@ def fused_multi_head_attention(
f"The rank of the x should be 3, but received {x.ndim}." f"The rank of the x should be 3, but received {x.ndim}."
) )
if _non_static_mode(): if in_dynamic_mode():
if default_main_program().random_seed != 0: if default_main_program().random_seed != 0:
seed = default_main_program().random_seed seed = default_main_program().random_seed
# pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out, # pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out,
...@@ -1046,7 +1047,7 @@ def fused_multi_transformer( ...@@ -1046,7 +1047,7 @@ def fused_multi_transformer(
'downgrade_in_infer' if mode == 'downscale_in_infer' else mode 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
) # semantic transfer ) # semantic transfer
if _non_static_mode(): if in_dynamic_mode():
cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer( cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer(
x, x,
ln_scales, ln_scales,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import paddle import paddle
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.framework import _non_static_mode from paddle.framework import in_dynamic_mode
class FusedDropout(paddle.nn.Layer): class FusedDropout(paddle.nn.Layer):
...@@ -104,7 +104,7 @@ class FusedDropout(paddle.nn.Layer): ...@@ -104,7 +104,7 @@ class FusedDropout(paddle.nn.Layer):
if self.p == 0: if self.p == 0:
return input return input
if self.axis is not None and _non_static_mode(): if self.axis is not None and in_dynamic_mode():
seed = None seed = None
if paddle.static.default_main_program().random_seed != 0: if paddle.static.default_main_program().random_seed != 0:
seed = paddle.static.default_main_program().random_seed seed = paddle.static.default_main_program().random_seed
......
...@@ -17,7 +17,8 @@ import paddle ...@@ -17,7 +17,8 @@ import paddle
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.core import VarDesc from paddle.fluid.core import VarDesc
from paddle.fluid.dygraph import no_grad from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_ from paddle.fluid.framework import convert_np_dtype_to_dtype_
from paddle.framework import in_dynamic_mode
from paddle.incubate.nn import functional as incubate_f from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer from paddle.nn import Layer
from paddle.nn.initializer import Constant from paddle.nn.initializer import Constant
...@@ -34,7 +35,7 @@ def _set_var_distributed(var): ...@@ -34,7 +35,7 @@ def _set_var_distributed(var):
var.is_distributed = True var.is_distributed = True
if not _non_static_mode(): if not in_dynamic_mode():
# NOTE: use current_block and find_var_recursive to support while_loop # NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block() startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block() main_block = paddle.static.default_main_program().current_block()
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import _non_static_mode from paddle.framework import in_dynamic_mode
def identity_loss(x, reduction="none"): def identity_loss(x, reduction="none"):
...@@ -59,7 +59,7 @@ def identity_loss(x, reduction="none"): ...@@ -59,7 +59,7 @@ def identity_loss(x, reduction="none"):
if reduction is None: if reduction is None:
raise Exception("Unsupported reduction type.") raise Exception("Unsupported reduction type.")
if _non_static_mode(): if in_dynamic_mode():
return _legacy_C_ops.identity_loss(x, "reduction", reduction) return _legacy_C_ops.identity_loss(x, "reduction", reduction)
check_variable_and_dtype(x, 'x', ['float32', 'float64'], "identity_loss") check_variable_and_dtype(x, 'x', ['float32', 'float64'], "identity_loss")
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
import paddle import paddle
from paddle import _C_ops from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
from .attn_bias import ( from .attn_bias import (
BlockDiagonalCausalMask, BlockDiagonalCausalMask,
...@@ -99,7 +99,7 @@ def memory_efficient_attention( ...@@ -99,7 +99,7 @@ def memory_efficient_attention(
bias = _get_tensor_bias(attn_bias) bias = _get_tensor_bias(attn_bias)
is_test = not training is_test = not training
if in_dygraph_mode(): if in_dynamic_mode():
output, logsumexp, seed_and_offset = _C_ops.memory_efficient_attention( output, logsumexp, seed_and_offset = _C_ops.memory_efficient_attention(
query, query,
key, key,
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
def graph_khop_sampler( def graph_khop_sampler(
...@@ -84,7 +84,7 @@ def graph_khop_sampler( ...@@ -84,7 +84,7 @@ def graph_khop_sampler(
""" """
if _non_static_mode(): if in_dynamic_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError( raise ValueError(
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
from paddle import _C_ops from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
from paddle.utils import deprecated from paddle.utils import deprecated
...@@ -116,7 +116,7 @@ def graph_reindex( ...@@ -116,7 +116,7 @@ def graph_reindex(
"be None if `flag_buffer_hashtable` is True." "be None if `flag_buffer_hashtable` is True."
) )
if _non_static_mode(): if in_dynamic_mode():
reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph( reindex_src, reindex_dst, out_nodes = _C_ops.reindex_graph(
x, x,
neighbors, neighbors,
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
from paddle import _legacy_C_ops from paddle import _legacy_C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import in_dynamic_mode
from paddle.utils import deprecated from paddle.utils import deprecated
...@@ -109,7 +109,7 @@ def graph_sample_neighbors( ...@@ -109,7 +109,7 @@ def graph_sample_neighbors(
"is True." "is True."
) )
if _non_static_mode(): if in_dynamic_mode():
( (
out_neighbors, out_neighbors,
out_count, out_count,
......
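For downstream code mirroring the hunks above, the migration rule suggested by this diff is: drop the private fluid helpers (fluid._non_static_mode and the fluid.framework in_dygraph_mode import at these call sites) and use the public check, reachable both as paddle.in_dynamic_mode and via paddle.framework. A two-line sanity check:

import paddle
from paddle.framework import in_dynamic_mode

# Both names resolve to the same public check; eager mode is the default.
assert paddle.in_dynamic_mode() and in_dynamic_mode()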
(The diffs of the remaining files in this PR are collapsed and not shown here.)