Unverified commit 69e51c77, authored by 姜永久, committed by GitHub

rm legacy nn part2 (#49259)

* rm legacy nn part2

* rm _non_static_mode

* modify

* modify unpool test

* modify unpool test

* modify loss

* keep legacy for layer_norm
Parent: e34e634a
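Every hunk below repeats one mechanical change: the `_legacy_C_ops` / `_non_static_mode` branches are deleted, so each API keeps only a dynamic-graph path (`in_dygraph_mode()` plus `_C_ops`) and a static-graph path (`LayerHelper` plus `append_op`), with layer_norm deliberately left alone per the last commit-message bullet. A condensed sketch of the resulting two-way dispatch, based on the gather_tree hunk below (dtype checks omitted; import paths are the ones this revision of the library uses internally):

```python
import paddle
from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper


def gather_tree_sketch(ids, parents):
    # Condensed form of the post-PR dispatch in paddle.nn.functional.gather_tree.
    if in_dygraph_mode():
        # dynamic graph: call the generated C++ op binding directly
        return _C_ops.gather_tree(ids, parents)
    else:
        # static graph: append an operator to the current Program
        helper = LayerHelper('gather_tree', **locals())
        out = helper.create_variable_for_type_inference(dtype=ids.dtype)
        helper.append_op(
            type="gather_tree",
            inputs={"Ids": ids, "Parents": parents},
            outputs={"Out": out},
        )
        return out
```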
@@ -414,6 +414,7 @@ class TestUnpoolOpAPI_st(unittest.TestCase):
             pool_out_np, indices_np, [2, 2], [2, 2], [0, 0], [5, 5]
         ).astype("float64")
         np.testing.assert_allclose(results[0], expect_res, rtol=1e-05)
+        paddle.disable_static()
 
 
 class TestOutputSizeTensor(UnittestBase):
...
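The single added line restores eager execution after the static-graph test body. A minimal sketch of the toggle it relies on (the standard `paddle.enable_static()` / `paddle.disable_static()` pair, shown here outside the test for illustration):

```python
import paddle

paddle.enable_static()    # switch to static-graph mode for the test body
# ... build a Program and run it with a static-graph Executor ...
paddle.disable_static()   # switch back so later tests run in dynamic mode
```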
@@ -23,11 +23,7 @@ from ...fluid.data_feeder import (
     check_type,
     check_variable_and_dtype,
 )
-from ...fluid.framework import (
-    _in_legacy_dygraph,
-    _non_static_mode,
-    in_dygraph_mode,
-)
+from ...fluid.framework import in_dygraph_mode
 from ...fluid.layer_helper import LayerHelper
 from ...framework import convert_np_dtype_to_dtype_, core
 from ...static import Variable
@@ -326,25 +322,20 @@ def gather_tree(ids, parents):
     if in_dygraph_mode():
         return _C_ops.gather_tree(ids, parents)
     else:
-        if _in_legacy_dygraph():
-            return _legacy_C_ops.gather_tree(ids, parents)
-        else:
-            helper = LayerHelper('gather_tree', **locals())
-            check_variable_and_dtype(
-                ids, 'ids', ['int32', 'int64'], 'gather_tree'
-            )
-            check_variable_and_dtype(
-                parents, 'parents', ['int32', 'int64'], 'gather_tree'
-            )
-            out = helper.create_variable_for_type_inference(dtype=ids.dtype)
-            helper.append_op(
-                type="gather_tree",
-                inputs={"Ids": ids, "Parents": parents},
-                outputs={"Out": out},
-            )
-            return out
+        helper = LayerHelper('gather_tree', **locals())
+        check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], 'gather_tree')
+        check_variable_and_dtype(
+            parents, 'parents', ['int32', 'int64'], 'gather_tree'
+        )
+        out = helper.create_variable_for_type_inference(dtype=ids.dtype)
+        helper.append_op(
+            type="gather_tree",
+            inputs={"Ids": ids, "Parents": parents},
+            outputs={"Out": out},
+        )
+        return out
 
 
 @templatedoc()
@@ -385,35 +376,27 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     )
     if in_dygraph_mode():
         return _C_ops.temporal_shift(x, seg_num, shift_ratio, data_format)
-    if _non_static_mode():
-        return _legacy_C_ops.temporal_shift(
-            x,
-            'seg_num',
-            seg_num,
-            'shift_ratio',
-            shift_ratio,
-            'data_format',
-            data_format,
-        )
-
-    helper = LayerHelper("temporal_shift", **locals())
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift')
-    check_type(seg_num, 'seg_num', int, 'temporal_shift')
-    check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift')
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    if not isinstance(seg_num, int):
-        raise TypeError("seg_num must be int type.")
-
-    helper.append_op(
-        type="temporal_shift",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={
-            "seg_num": seg_num,
-            "shift_ratio": shift_ratio,
-            "data_format": data_format,
-        },
-    )
-    return out
+    else:
+        helper = LayerHelper("temporal_shift", **locals())
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'temporal_shift'
+        )
+        check_type(seg_num, 'seg_num', int, 'temporal_shift')
+        check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift')
+
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        if not isinstance(seg_num, int):
+            raise TypeError("seg_num must be int type.")
+
+        helper.append_op(
+            type="temporal_shift",
+            inputs={"X": x},
+            outputs={"Out": out},
+            attrs={
+                "seg_num": seg_num,
+                "shift_ratio": shift_ratio,
+                "data_format": data_format,
+            },
+        )
+        return out
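For reference, a small usage sketch of the wrapper whose signature appears in the hunk header above; it assumes the public `paddle.nn.functional.temporal_shift` entry point and illustrative shapes (a batch of 8 frames = 2 clips of seg_num 4):

```python
import paddle
import paddle.nn.functional as F

# 2 clips * 4 segments = 8 frames, 3 channels, 16x16 feature maps (NCHW)
x = paddle.rand([8, 3, 16, 16])
out = F.temporal_shift(x, seg_num=4, shift_ratio=0.25, data_format="NCHW")
print(out.shape)  # [8, 3, 16, 16]
```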
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from ...fluid.data_feeder import check_variable_and_dtype
-from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+from ...fluid.framework import in_dygraph_mode
 from ...fluid.layer_helper import LayerHelper
 from ...static import Variable
@@ -88,35 +88,26 @@ def one_hot(x, num_classes, name=None):
     if in_dygraph_mode():
         return _C_ops.one_hot(x, num_classes)
     else:
-        if _in_legacy_dygraph():
-            return _legacy_C_ops.one_hot_v2(
-                x, 'depth', num_classes, 'allow_out_of_range', False
-            )
-        else:
-            check_variable_and_dtype(
-                x, 'input', ['int32', 'int64'], 'one_hot_v2'
-            )
-            helper = LayerHelper("one_hot_v2", **locals())
-
-            one_hot_out = helper.create_variable_for_type_inference(
-                dtype='float32'
-            )
-            if not isinstance(num_classes, Variable):
-                # user attribute
-                inputs = {'X': x}
-                attrs = {'depth': num_classes, 'allow_out_of_range': False}
-            else:
-                num_classes.stop_gradient = True
-                inputs = {'X': x, 'depth_tensor': num_classes}
-                attrs = {'allow_out_of_range': False}
-            helper.append_op(
-                type="one_hot_v2",
-                inputs=inputs,
-                attrs=attrs,
-                outputs={'Out': one_hot_out},
-                stop_gradient=True,
-            )
-            return one_hot_out
+        check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'one_hot_v2')
+        helper = LayerHelper("one_hot_v2", **locals())
+
+        one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
+        if not isinstance(num_classes, Variable):
+            # user attribute
+            inputs = {'X': x}
+            attrs = {'depth': num_classes, 'allow_out_of_range': False}
+        else:
+            num_classes.stop_gradient = True
+            inputs = {'X': x, 'depth_tensor': num_classes}
+            attrs = {'allow_out_of_range': False}
+        helper.append_op(
+            type="one_hot_v2",
+            inputs=inputs,
+            attrs=attrs,
+            outputs={'Out': one_hot_out},
+            stop_gradient=True,
+        )
+        return one_hot_out
 
 
 def embedding(x, weight, padding_idx=None, sparse=False, name=None):
@@ -212,19 +203,6 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
     if in_dygraph_mode():
         return _C_ops.embedding(x, weight, padding_idx, sparse)
-    elif _in_legacy_dygraph():
-        return _legacy_C_ops.lookup_table_v2(
-            weight,
-            x,
-            'is_sparse',
-            sparse,
-            'is_distributed',
-            False,
-            'remote_prefetch',
-            False,
-            'padding_idx',
-            padding_idx,
-        )
     else:
         helper = LayerHelper('embedding', **locals())
         dtype = helper.input_dtype(input_param_name='weight')
...
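A short usage sketch for the two APIs changed in this file, assuming the public `paddle.nn.functional.one_hot` and `paddle.nn.functional.embedding` entry points and made-up shapes:

```python
import paddle
import paddle.nn.functional as F

labels = paddle.to_tensor([1, 1, 3, 0], dtype='int64')
one_hot = F.one_hot(labels, num_classes=4)  # shape [4, 4], float32

weight = paddle.rand([10, 8])               # vocabulary of 10, embedding dim 8
emb = F.embedding(labels, weight)           # shape [4, 8]
```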
@@ -21,15 +21,8 @@ from paddle.framework import core
 from paddle.utils import deprecated
 
 from ...fluid.data_feeder import check_variable_and_dtype
-from ...fluid.framework import (
-    _current_expected_place,
-    _in_legacy_dygraph,
-    _non_static_mode,
-    _varbase_creator,
-    in_dygraph_mode,
-)
+from ...fluid.framework import _current_expected_place, in_dygraph_mode
 from ...fluid.layer_helper import LayerHelper
-from ...fluid.layers.nn import _elementwise_op_in_dygraph
 from ...static import Variable
 from ...tensor.manipulation import reshape
...@@ -260,7 +253,7 @@ def fluid_softmax_with_cross_entropy( ...@@ -260,7 +253,7 @@ def fluid_softmax_with_cross_entropy(
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.15328646]) # [1.15328646])
""" """
if _non_static_mode(): if in_dygraph_mode():
if core.is_compiled_with_npu(): if core.is_compiled_with_npu():
softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy( softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits, logits,
...@@ -275,59 +268,47 @@ def fluid_softmax_with_cross_entropy( ...@@ -275,59 +268,47 @@ def fluid_softmax_with_cross_entropy(
axis, axis,
) )
else: else:
if in_dygraph_mode(): softmax, loss = _C_ops.cross_entropy_with_softmax(
softmax, loss = _C_ops.cross_entropy_with_softmax( logits,
logits, label,
label, soft_label,
soft_label, True,
True, numeric_stable_mode,
numeric_stable_mode, ignore_index,
ignore_index, axis,
axis, )
)
if _in_legacy_dygraph():
softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
numeric_stable_mode,
'axis',
axis,
)
if not return_softmax: if not return_softmax:
return loss return loss
else: else:
return loss, softmax return loss, softmax
else:
attrs = {
'soft_label': soft_label,
'ignore_index': ignore_index,
'numeric_stable_mode': numeric_stable_mode,
'axis': axis,
}
helper = LayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
attrs = { outputs = {'Softmax': softmax, 'Loss': loss}
'soft_label': soft_label, if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
'ignore_index': ignore_index, backprop = helper.create_variable_for_type_inference(
'numeric_stable_mode': numeric_stable_mode, dtype=logits.dtype
'axis': axis, )
} outputs['Backprop'] = backprop
helper = LayerHelper('softmax_with_cross_entropy', **locals()) helper.append_op(
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) type='softmax_with_cross_entropy',
loss = helper.create_variable_for_type_inference(dtype=logits.dtype) inputs={'Logits': logits, 'Label': label},
outputs=outputs,
outputs = {'Softmax': softmax, 'Loss': loss} attrs=attrs,
if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): )
backprop = helper.create_variable_for_type_inference(dtype=logits.dtype)
outputs['Backprop'] = backprop
helper.append_op(
type='softmax_with_cross_entropy',
inputs={'Logits': logits, 'Label': label},
outputs=outputs,
attrs=attrs,
)
if return_softmax: if return_softmax:
return loss, softmax return loss, softmax
return loss return loss
def npair_loss(anchor, positive, labels, l2_reg=0.002): def npair_loss(anchor, positive, labels, l2_reg=0.002):
@@ -441,30 +422,30 @@ def square_error_cost(input, label):
         minus_out = _C_ops.subtract(input, label)
         square_out = _C_ops.square(minus_out)
         return square_out
-    elif _in_legacy_dygraph():
-        minus_out = _legacy_C_ops.elementwise_sub(input, label)
-        square_out = _legacy_C_ops.square(minus_out)
-        return square_out
-
-    check_variable_and_dtype(
-        input, "input", ['float32', 'float64'], 'square_error_cost'
-    )
-    check_variable_and_dtype(
-        label, "label", ['float32', 'float64'], 'square_error_cost'
-    )
-    helper = LayerHelper('square_error_cost', **locals())
-    minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input], 'Y': [label]},
-        outputs={'Out': [minus_out]},
-    )
-
-    square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]}, outputs={'Out': [square_out]}
-    )
-    return square_out
+    else:
+        check_variable_and_dtype(
+            input, "input", ['float32', 'float64'], 'square_error_cost'
+        )
+        check_variable_and_dtype(
+            label, "label", ['float32', 'float64'], 'square_error_cost'
+        )
+        helper = LayerHelper('square_error_cost', **locals())
+        minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
+        helper.append_op(
+            type='elementwise_sub',
+            inputs={'X': [input], 'Y': [label]},
+            outputs={'Out': [minus_out]},
+        )
+
+        square_out = helper.create_variable_for_type_inference(
+            dtype=input.dtype
+        )
+        helper.append_op(
+            type='square',
+            inputs={'X': [minus_out]},
+            outputs={'Out': [square_out]},
+        )
+        return square_out
 
 
 def edit_distance(
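A tiny usage sketch for the function above, assuming the public `paddle.nn.functional.square_error_cost` entry point:

```python
import paddle
import paddle.nn.functional as F

pred = paddle.to_tensor([1.0, 2.0, 3.0])
label = paddle.to_tensor([1.5, 2.0, 2.5])
cost = F.square_error_cost(pred, label)  # elementwise (pred - label) ** 2 -> [0.25, 0.0, 0.25]
```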
...@@ -675,53 +656,40 @@ def binary_cross_entropy( ...@@ -675,53 +656,40 @@ def binary_cross_entropy(
else: else:
return out return out
else: else:
if _in_legacy_dygraph(): check_variable_and_dtype(
out = _legacy_C_ops.bce_loss(input, label) input, 'input', ['float32', 'float64'], 'binary_cross_entropy'
if weight is not None: )
out = _legacy_C_ops.elementwise_mul(out, weight, 'axis', -1) check_variable_and_dtype(
if reduction == 'sum': label, 'label', ['float32', 'float64'], 'binary_cross_entropy'
return _legacy_C_ops.reduce_sum( )
out, 'dim', [0], 'keep_dim', False, "reduce_all", True
)
elif reduction == 'mean':
return _legacy_C_ops.mean(out)
else:
return out
else:
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'binary_cross_entropy'
)
check_variable_and_dtype(
label, 'label', ['float32', 'float64'], 'binary_cross_entropy'
)
sub_name = name if weight is None and reduction == 'none' else None
helper = LayerHelper("binary_cross_entropy", name=sub_name)
out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type='bce_loss',
inputs={
'X': [input],
'Label': [label],
},
outputs={'Out': [out]},
)
if weight is not None: sub_name = name if weight is None and reduction == 'none' else None
if isinstance(weight, paddle.static.Variable): helper = LayerHelper("binary_cross_entropy", name=sub_name)
weight_name = name if reduction == 'none' else None out = helper.create_variable_for_type_inference(dtype=input.dtype)
out = paddle.multiply(out, weight, name=weight_name) helper.append_op(
else: type='bce_loss',
raise ValueError( inputs={
"The weight is not a Tensor, please convert to Tensor." 'X': [input],
) 'Label': [label],
},
outputs={'Out': [out]},
)
if reduction == 'sum': if weight is not None:
return paddle.sum(out, name=name) if isinstance(weight, paddle.static.Variable):
elif reduction == 'mean': weight_name = name if reduction == 'none' else None
return paddle.mean(out, name=name) out = paddle.multiply(out, weight, name=weight_name)
else: else:
return out raise ValueError(
"The weight is not a Tensor, please convert to Tensor."
)
if reduction == 'sum':
return paddle.sum(out, name=name)
elif reduction == 'mean':
return paddle.mean(out, name=name)
else:
return out
def binary_cross_entropy_with_logits( def binary_cross_entropy_with_logits(
...@@ -833,98 +801,65 @@ def binary_cross_entropy_with_logits( ...@@ -833,98 +801,65 @@ def binary_cross_entropy_with_logits(
return _C_ops.mean_all(out) return _C_ops.mean_all(out)
else: else:
return out return out
elif _in_legacy_dygraph(): else:
one = _varbase_creator(dtype=logit.dtype)
_legacy_C_ops.fill_constant(
one,
'value',
float(1.0),
'force_cpu',
False,
'dtype',
one.dtype,
'str_value',
'1.0',
'shape',
[1],
)
out = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label)
if pos_weight is not None:
log_weight = _legacy_C_ops.elementwise_add(
_legacy_C_ops.elementwise_mul(
label, _legacy_C_ops.elementwise_sub(pos_weight, one)
),
one,
)
out = _legacy_C_ops.elementwise_mul(out, log_weight)
if weight is not None:
out = _legacy_C_ops.elementwise_mul(out, weight)
if reduction == "sum":
return _legacy_C_ops.reduce_sum(out, 'reduce_all', True)
elif reduction == "mean":
return _legacy_C_ops.mean(out)
else:
return out
check_variable_and_dtype(
logit,
'logit',
['float32', 'float64'],
'binary_cross_entropy_with_logits',
)
check_variable_and_dtype(
label,
'label',
['float32', 'float64'],
'binary_cross_entropy_with_logits',
)
sigmoid_name = None
if reduction == 'none' and pos_weight is None and weight is None:
sigmoid_name = name
helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
out = helper.create_variable_for_type_inference(dtype=logit.dtype)
helper.append_op(
type="sigmoid_cross_entropy_with_logits",
inputs={"X": logit, "Label": label},
attrs={"ignore_index": kIgnoreIndex, 'normalize': False},
outputs={"Out": out},
)
one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype)
if pos_weight is not None:
check_variable_and_dtype( check_variable_and_dtype(
pos_weight, logit,
'pos_weight', 'logit',
['float32', 'float64'], ['float32', 'float64'],
'binary_cross_entropy_with_logits', 'binary_cross_entropy_with_logits',
) )
log_weight = paddle.add(
paddle.multiply(label, paddle.subtract(pos_weight, one)), one
)
pos_weight_name = (
name if reduction == 'none' and weight is None else None
)
out = paddle.multiply(out, log_weight, name=pos_weight_name)
if weight is not None:
check_variable_and_dtype( check_variable_and_dtype(
weight, label,
'weight', 'label',
['float32', 'float64'], ['float32', 'float64'],
'binary_cross_entropy_with_logits', 'binary_cross_entropy_with_logits',
) )
weight_name = name if reduction == 'none' else None sigmoid_name = None
out = paddle.multiply(out, weight, name=weight_name) if reduction == 'none' and pos_weight is None and weight is None:
sigmoid_name = name
if reduction == "sum": helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
return paddle.sum(out, name=name)
elif reduction == "mean": out = helper.create_variable_for_type_inference(dtype=logit.dtype)
return paddle.mean(out, name=name)
return out helper.append_op(
type="sigmoid_cross_entropy_with_logits",
inputs={"X": logit, "Label": label},
attrs={"ignore_index": kIgnoreIndex, 'normalize': False},
outputs={"Out": out},
)
one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype)
if pos_weight is not None:
check_variable_and_dtype(
pos_weight,
'pos_weight',
['float32', 'float64'],
'binary_cross_entropy_with_logits',
)
log_weight = paddle.add(
paddle.multiply(label, paddle.subtract(pos_weight, one)), one
)
pos_weight_name = (
name if reduction == 'none' and weight is None else None
)
out = paddle.multiply(out, log_weight, name=pos_weight_name)
if weight is not None:
check_variable_and_dtype(
weight,
'weight',
['float32', 'float64'],
'binary_cross_entropy_with_logits',
)
weight_name = name if reduction == 'none' else None
out = paddle.multiply(out, weight, name=weight_name)
if reduction == "sum":
return paddle.sum(out, name=name)
elif reduction == "mean":
return paddle.mean(out, name=name)
return out
def hsigmoid_loss( def hsigmoid_loss(
...@@ -1031,67 +966,55 @@ def hsigmoid_loss( ...@@ -1031,67 +966,55 @@ def hsigmoid_loss(
is_sparse, is_sparse,
) )
return out return out
elif _in_legacy_dygraph(): else:
out, _, _ = _legacy_C_ops.hierarchical_sigmoid(
input,
weight,
label,
path_table,
path_code,
bias,
'num_classes',
num_classes,
'is_sparse',
is_sparse,
'remote_prefetch',
is_sparse,
)
return out
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'hsigmoid_loss'
)
check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss')
check_variable_and_dtype(
weight, 'weight', ['float32', 'float64'], 'hsigmoid_loss'
)
if bias is not None:
check_variable_and_dtype(
bias, 'bias', ['float32', 'float64'], 'hsigmoid_loss'
)
if path_table is not None:
check_variable_and_dtype( check_variable_and_dtype(
path_table, 'path_table', ['int64'], 'hsigmoid_loss' input, 'input', ['float32', 'float64'], 'hsigmoid_loss'
) )
if path_code is not None: check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss')
check_variable_and_dtype( check_variable_and_dtype(
path_code, 'path_code', ['int64'], 'hsigmoid_loss' weight, 'weight', ['float32', 'float64'], 'hsigmoid_loss'
) )
if bias is not None:
check_variable_and_dtype(
bias, 'bias', ['float32', 'float64'], 'hsigmoid_loss'
)
if path_table is not None:
check_variable_and_dtype(
path_table, 'path_table', ['int64'], 'hsigmoid_loss'
)
if path_code is not None:
check_variable_and_dtype(
path_code, 'path_code', ['int64'], 'hsigmoid_loss'
)
attrs = { attrs = {
"num_classes": num_classes, "num_classes": num_classes,
"is_sparse": is_sparse, "is_sparse": is_sparse,
"remote_prefetch": is_sparse, "remote_prefetch": is_sparse,
} }
inputs = {
"X": input,
"W": weight,
"Bias": bias,
"PathTable": path_table,
"PathCode": path_code,
"Label": label,
}
helper = LayerHelper('hsigmoid_loss', **locals())
out = helper.create_variable_for_type_inference(input.dtype)
pre_out = helper.create_variable_for_type_inference(input.dtype)
outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
helper.append_op( inputs = {
type="hierarchical_sigmoid", inputs=inputs, outputs=outputs, attrs=attrs "X": input,
) "W": weight,
return out "Bias": bias,
"PathTable": path_table,
"PathCode": path_code,
"Label": label,
}
helper = LayerHelper('hsigmoid_loss', **locals())
out = helper.create_variable_for_type_inference(input.dtype)
pre_out = helper.create_variable_for_type_inference(input.dtype)
outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
helper.append_op(
type="hierarchical_sigmoid",
inputs=inputs,
outputs=outputs,
attrs=attrs,
)
return out
def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
...@@ -1246,65 +1169,55 @@ def margin_ranking_loss( ...@@ -1246,65 +1169,55 @@ def margin_ranking_loss(
elif reduction == 'mean': elif reduction == 'mean':
return _C_ops.mean_all(out) return _C_ops.mean_all(out)
return out return out
elif _in_legacy_dygraph(): else:
out = _legacy_C_ops.elementwise_sub(other, input) helper = LayerHelper("margin_ranking_loss", **locals())
out = _legacy_C_ops.elementwise_mul(out, label) check_variable_and_dtype(
if margin != 0.0: input, 'input', ['float32', 'float64'], 'margin_rank_loss'
margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype) )
out = _legacy_C_ops.elementwise_add(out, margin) check_variable_and_dtype(
out = _legacy_C_ops.relu(out) other, 'other', ['float32', 'float64'], 'margin_rank_loss'
if reduction == 'sum': )
return _legacy_C_ops.reduce_sum(out, 'reduce_all', True) check_variable_and_dtype(
elif reduction == 'mean': label, 'label', ['float32', 'float64'], 'margin_rank_loss'
return _legacy_C_ops.mean(out) )
return out
helper = LayerHelper("margin_ranking_loss", **locals())
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'margin_rank_loss'
)
check_variable_and_dtype(
other, 'other', ['float32', 'float64'], 'margin_rank_loss'
)
check_variable_and_dtype(
label, 'label', ['float32', 'float64'], 'margin_rank_loss'
)
out = paddle.subtract(input, other) out = paddle.subtract(input, other)
neg_label = paddle.neg(label) neg_label = paddle.neg(label)
out = paddle.multiply(neg_label, out) out = paddle.multiply(neg_label, out)
if margin != 0.0: if margin != 0.0:
margin_var = out.block.create_var(dtype=out.dtype) margin_var = out.block.create_var(dtype=out.dtype)
margin_var = paddle.full(shape=[1], fill_value=margin, dtype=out.dtype) margin_var = paddle.full(
out = paddle.add(out, margin_var) shape=[1], fill_value=margin, dtype=out.dtype
)
out = paddle.add(out, margin_var)
result_out = helper.create_variable_for_type_inference(input.dtype) result_out = helper.create_variable_for_type_inference(input.dtype)
if reduction == 'none': if reduction == 'none':
helper.append_op( helper.append_op(
type="relu", inputs={"X": out}, outputs={"Out": result_out} type="relu", inputs={"X": out}, outputs={"Out": result_out}
) )
return result_out return result_out
elif reduction == 'sum': elif reduction == 'sum':
out = paddle.nn.functional.relu(out) out = paddle.nn.functional.relu(out)
attrs = {"dim": [0], "keep_dim": False, "reduce_all": True} attrs = {"dim": [0], "keep_dim": False, "reduce_all": True}
helper.append_op( helper.append_op(
type="reduce_sum", type="reduce_sum",
inputs={"X": out}, inputs={"X": out},
outputs={"Out": result_out}, outputs={"Out": result_out},
attrs=attrs, attrs=attrs,
) )
return result_out return result_out
elif reduction == 'mean': elif reduction == 'mean':
out = paddle.nn.functional.relu(out) out = paddle.nn.functional.relu(out)
helper.append_op( helper.append_op(
type="mean", type="mean",
inputs={"X": out}, inputs={"X": out},
outputs={"Out": result_out}, outputs={"Out": result_out},
attrs={}, attrs={},
) )
return result_out return result_out
def l1_loss(input, label, reduction='mean', name=None): def l1_loss(input, label, reduction='mean', name=None):
@@ -1384,34 +1297,22 @@ def l1_loss(input, label, reduction='mean', name=None):
             return _C_ops.sum(unreduced, [], None, False)
         else:
             return unreduced
-    elif _in_legacy_dygraph():
-        unreduced = _elementwise_op_in_dygraph(
-            input, label, axis=-1, act='abs', op_name='elementwise_sub'
-        )
-        if reduction == 'mean':
-            return _legacy_C_ops.mean(unreduced)
-        elif reduction == 'sum':
-            return _legacy_C_ops.reduce_sum(
-                unreduced, 'dim', [0], 'keep_dim', False, 'reduce_all', True
-            )
-        else:
-            return unreduced
-
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
-    )
-    check_variable_and_dtype(
-        label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
-    )
-
-    if reduction == 'sum':
-        unreduced = paddle.abs(paddle.subtract(x=input, y=label))
-        return paddle.sum(unreduced, name=name)
-    elif reduction == 'mean':
-        unreduced = paddle.abs(paddle.subtract(x=input, y=label))
-        return paddle.mean(unreduced, name=name)
-    else:
-        return paddle.abs(paddle.subtract(x=input, y=label, name=name))
+    else:
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
+        )
+        check_variable_and_dtype(
+            label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
+        )
+
+        if reduction == 'sum':
+            unreduced = paddle.abs(paddle.subtract(x=input, y=label))
+            return paddle.sum(unreduced, name=name)
+        elif reduction == 'mean':
+            unreduced = paddle.abs(paddle.subtract(x=input, y=label))
+            return paddle.mean(unreduced, name=name)
+        else:
+            return paddle.abs(paddle.subtract(x=input, y=label, name=name))
 
 
 def nll_loss(
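A small usage sketch of the reductions handled above, assuming the public `paddle.nn.functional.l1_loss` entry point:

```python
import paddle
import paddle.nn.functional as F

input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]])
label = paddle.to_tensor([[1.7, 1.0], [0.4, 0.5]])

loss_mean = F.l1_loss(input, label)                    # default: mean of |input - label|
loss_sum = F.l1_loss(input, label, reduction='sum')    # sum of |input - label|
loss_none = F.l1_loss(input, label, reduction='none')  # elementwise, same shape as input
```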
...@@ -1488,53 +1389,37 @@ def nll_loss( ...@@ -1488,53 +1389,37 @@ def nll_loss(
if input_dims != 2 and input_dims != 4 and reduction == 'none': if input_dims != 2 and input_dims != 4 and reduction == 'none':
out = _C_ops.reshape(out, out_shape) out = _C_ops.reshape(out, out_shape)
return out return out
elif _in_legacy_dygraph(): else:
helper = LayerHelper('nll_loss', **locals())
if input_dims != 2 and input_dims != 4: if input_dims != 2 and input_dims != 4:
input, _ = _legacy_C_ops.reshape2( input = reshape(input, shape=[n, c, 1, -1])
input, None, 'shape', [n, c, 1, -1] label = reshape(label, shape=[n, 1, -1])
)
label, _ = _legacy_C_ops.reshape2(label, None, 'shape', [n, 1, -1])
out_shape = [n] + input_shape[2:] out_shape = [n] + input_shape[2:]
out, total_weight = _legacy_C_ops.nll_loss( check_variable_and_dtype(
input, input, 'input', ['float32', 'float64'], 'nll_loss'
label,
weight,
'ignore_index',
ignore_index,
'reduction',
reduction,
) )
if input_dims != 2 and input_dims != 4 and reduction == 'none': check_variable_and_dtype(label, 'label', ['int64'], 'nll_loss')
out, _ = _legacy_C_ops.reshape2(out, None, 'shape', out_shape) inputs = {'X': input, 'Label': label}
return out attrs = {'reduction': reduction, 'ignore_index': ignore_index}
if weight is not None:
helper = LayerHelper('nll_loss', **locals()) if isinstance(weight, Variable):
inputs['Weight'] = weight
if input_dims != 2 and input_dims != 4:
input = reshape(input, shape=[n, c, 1, -1])
label = reshape(label, shape=[n, 1, -1])
out_shape = [n] + input_shape[2:]
check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nll_loss')
check_variable_and_dtype(label, 'label', ['int64'], 'nll_loss')
inputs = {'X': input, 'Label': label}
attrs = {'reduction': reduction, 'ignore_index': ignore_index}
if weight is not None:
if isinstance(weight, Variable):
inputs['Weight'] = weight
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
total_weight = helper.create_variable_for_type_inference(dtype=input.dtype) total_weight = helper.create_variable_for_type_inference(
outputs = {'Out': out, 'Total_weight': total_weight} dtype=input.dtype
)
outputs = {'Out': out, 'Total_weight': total_weight}
helper.append_op( helper.append_op(
type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs
) )
if input_dims != 2 and input_dims != 4 and reduction == 'none': if input_dims != 2 and input_dims != 4 and reduction == 'none':
out = reshape(out, shape=out_shape) out = reshape(out, shape=out_shape)
return out return out
def kl_div(input, label, reduction='mean', name=None): def kl_div(input, label, reduction='mean', name=None):
@@ -1624,40 +1509,33 @@ def kl_div(input, label, reduction='mean', name=None):
             batch_size = input.shape[0]
             out = paddle.sum(out) / batch_size
         return out
-    elif _in_legacy_dygraph():
-        out = _legacy_C_ops.kldiv_loss(input, label, 'reduction', 'none')
-        if reduction == 'mean':
-            out = paddle.mean(out)
-        elif reduction == 'sum':
-            out = paddle.sum(out)
-        elif reduction == 'batchmean':
-            if len(input.shape) > 0:
-                batch_size = input.shape[0]
-                out = paddle.sum(out) / batch_size
-        return out
-
-    helper = LayerHelper('kl_div', **locals())
-
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'kl_div')
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'kl_div')
-    fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
-
-    loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='kldiv_loss',
-        inputs={'X': input, 'Target': label},
-        outputs={'Loss': loss},
-        attrs={'reduction': 'none'},
-    )
-
-    if reduction == 'mean':
-        loss = paddle.mean(loss)
-    elif reduction == 'sum':
-        loss = paddle.sum(loss)
-    elif reduction == 'batchmean':
-        batch_size = paddle.shape(input)[0]
-        loss = paddle.sum(loss) / batch_size
-    return loss
+    else:
+        helper = LayerHelper('kl_div', **locals())
+
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'kl_div'
+        )
+        check_variable_and_dtype(
+            label, 'label', ['float32', 'float64'], 'kl_div'
+        )
+        fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
+
+        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+        helper.append_op(
+            type='kldiv_loss',
+            inputs={'X': input, 'Target': label},
+            outputs={'Loss': loss},
+            attrs={'reduction': 'none'},
+        )
+
+        if reduction == 'mean':
+            loss = paddle.mean(loss)
+        elif reduction == 'sum':
+            loss = paddle.sum(loss)
+        elif reduction == 'batchmean':
+            batch_size = paddle.shape(input)[0]
+            loss = paddle.sum(loss) / batch_size
+        return loss
 
 
 def mse_loss(input, label, reduction='mean', name=None):
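A small usage sketch of the reduction semantics implemented above ('batchmean' divides the summed loss by the batch size), assuming the public `paddle.nn.functional.kl_div` entry point; shapes are illustrative:

```python
import paddle
import paddle.nn.functional as F

# kl_div expects log-probabilities as input and probabilities as target
input = F.log_softmax(paddle.rand([5, 20]), axis=-1)
target = F.softmax(paddle.rand([5, 20]), axis=-1)

loss = F.kl_div(input, target, reduction='batchmean')  # sum over all elements / 5
```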
...@@ -1836,51 +1714,40 @@ def ctc_loss( ...@@ -1836,51 +1714,40 @@ def ctc_loss(
input, label, input_length, label_length, blank, norm_by_times input, label, input_length, label_length, blank, norm_by_times
) )
return loss_out return loss_out
if _non_static_mode(): else:
if input_length is None or label_length is None: helper = LayerHelper('warpctc', **locals())
raise ValueError(
"input_length and label_length must not be None in dygraph mode!"
)
grad, loss_out = _legacy_C_ops.warpctc(
input,
label,
input_length,
label_length,
'blank',
blank,
'norm_by_times',
norm_by_times,
)
return loss_out
helper = LayerHelper('warpctc', **locals())
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], "warpctc"
)
check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
this_inputs = {'Logits': [input], 'Label': [label]}
if input_length is not None and label_length is not None:
check_variable_and_dtype(
input_length, 'LogitsLength', ['int64'], "warpctc"
)
check_variable_and_dtype( check_variable_and_dtype(
label_length, 'LabelLength', ['int64'], "warpctc" input, 'input', ['float32', 'float64'], "warpctc"
) )
this_inputs['LogitsLength'] = [input_length] check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
this_inputs['LabelLength'] = [label_length] this_inputs = {'Logits': [input], 'Label': [label]}
if input_length is not None and label_length is not None:
check_variable_and_dtype(
input_length, 'LogitsLength', ['int64'], "warpctc"
)
check_variable_and_dtype(
label_length, 'LabelLength', ['int64'], "warpctc"
)
this_inputs['LogitsLength'] = [input_length]
this_inputs['LabelLength'] = [label_length]
loss_out = helper.create_variable_for_type_inference(dtype=input.dtype) loss_out = helper.create_variable_for_type_inference(
grad_out = helper.create_variable_for_type_inference(dtype=input.dtype) dtype=input.dtype
)
grad_out = helper.create_variable_for_type_inference(
dtype=input.dtype
)
helper.append_op( helper.append_op(
type='warpctc', type='warpctc',
inputs=this_inputs, inputs=this_inputs,
outputs={'WarpCTCGrad': [grad_out], 'Loss': [loss_out]}, outputs={'WarpCTCGrad': [grad_out], 'Loss': [loss_out]},
attrs={ attrs={
'blank': blank, 'blank': blank,
'norm_by_times': norm_by_times, 'norm_by_times': norm_by_times,
}, },
) )
return loss_out return loss_out
loss_out = warpctc( loss_out = warpctc(
log_probs, labels, blank, norm_by_times, input_lengths, label_lengths log_probs, labels, blank, norm_by_times, input_lengths, label_lengths
...@@ -2274,77 +2141,48 @@ def margin_cross_entropy( ...@@ -2274,77 +2141,48 @@ def margin_cross_entropy(
return loss return loss
else: else:
return loss, softmax return loss, softmax
elif _in_legacy_dygraph(): else:
softmax, loss = _legacy_C_ops.margin_cross_entropy( op_type = 'margin_cross_entropy'
helper = LayerHelper(op_type, **locals())
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
check_variable_and_dtype(
logits, logits,
label, 'logits',
'ring_id', ['float16', 'float32', 'float64'],
ring_id, 'margin_cross_entropy',
'rank',
rank,
'nranks',
nranks,
'margin1',
margin1,
'margin2',
margin2,
'margin3',
margin3,
'scale',
scale,
'return_softmax',
return_softmax,
) )
check_variable_and_dtype(
label, 'label', ['int32', 'int64'], 'margin_cross_entropy'
)
helper.append_op(
type=op_type,
inputs={'Logits': logits, 'Label': label},
outputs={'Softmax': softmax, 'Loss': loss},
attrs={
'return_softmax': return_softmax,
'ring_id': ring_id,
'rank': rank,
'nranks': nranks,
'margin1': margin1,
'margin2': margin2,
'margin3': margin3,
'scale': scale,
},
)
if reduction == 'mean': if reduction == 'mean':
loss = paddle.mean(loss) loss = paddle.mean(loss)
elif reduction == 'sum': elif reduction == 'sum':
loss = paddle.sum(loss) loss = paddle.sum(loss)
if not return_softmax: if not return_softmax:
return loss return loss
else: else:
return loss, softmax return loss, softmax
op_type = 'margin_cross_entropy'
helper = LayerHelper(op_type, **locals())
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
check_variable_and_dtype(
logits,
'logits',
['float16', 'float32', 'float64'],
'margin_cross_entropy',
)
check_variable_and_dtype(
label, 'label', ['int32', 'int64'], 'margin_cross_entropy'
)
helper.append_op(
type=op_type,
inputs={'Logits': logits, 'Label': label},
outputs={'Softmax': softmax, 'Loss': loss},
attrs={
'return_softmax': return_softmax,
'ring_id': ring_id,
'rank': rank,
'nranks': nranks,
'margin1': margin1,
'margin2': margin2,
'margin3': margin3,
'scale': scale,
},
)
if reduction == 'mean':
loss = paddle.mean(loss)
elif reduction == 'sum':
loss = paddle.sum(loss)
if not return_softmax:
return loss
else:
return loss, softmax
@deprecated( @deprecated(
since="2.0.0", since="2.0.0",
...@@ -2863,73 +2701,54 @@ def cross_entropy( ...@@ -2863,73 +2701,54 @@ def cross_entropy(
out = paddle.squeeze(out, axis=axis) out = paddle.squeeze(out, axis=axis)
return out return out
elif _in_legacy_dygraph(): else:
if not soft_label: check_variable_and_dtype(
valid_label = ( input,
paddle.cast(label != ignore_index, dtype=label.dtype) * label 'input',
) ['float16', 'float32', 'float64'],
label_min = paddle.min(valid_label) 'softmax_cross_entropy',
label_max = paddle.max(valid_label) )
if label_min < 0: check_variable_and_dtype(
raise ValueError( label,
"Target {} is out of lower bound.".format(label_min.item()) 'label',
) ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'],
if label_max >= input.shape[axis]: 'softmax_cross_entropy',
raise ValueError( )
"Target {} is out of upper bound.".format(label_max.item()) attrs = {
) 'soft_label': soft_label,
'ignore_index': ignore_index,
'numeric_stable_mode': True,
'axis': axis,
'use_softmax': use_softmax,
}
helper = LayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_variable_for_type_inference(dtype=input.dtype)
out = helper.create_variable_for_type_inference(dtype=input.dtype)
outputs = {'Softmax': softmax, 'Loss': out}
if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
if not soft_label: backprop = helper.create_variable_for_type_inference(
_, _, out = _legacy_C_ops.softmax_with_cross_entropy( dtype=input.dtype
input,
valid_label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, _, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
) )
outputs['Backprop'] = backprop
helper.append_op(
type='softmax_with_cross_entropy',
inputs={'Logits': input, 'Label': label},
outputs=outputs,
attrs=attrs,
)
if weight is not None: if weight is not None:
check_variable_and_dtype(
# trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases. weight,
'weight',
['float32', 'float64'],
'softmax_cross_entropy',
)
weight_name = name if reduction == 'none' else None
if soft_label: if soft_label:
# chajchaj: # chajchaj:
# trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
# weight's shape is C, where C is class num. # weight's shape is C, where C is class num.
# for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 1d case: label's shape is [N,C], weight_gather's shape is N.
# for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
...@@ -2939,12 +2758,10 @@ def cross_entropy( ...@@ -2939,12 +2758,10 @@ def cross_entropy(
transpose_x=False, transpose_x=False,
transpose_y=True, transpose_y=True,
) )
out_shape = list(out.shape) out_shape = list(out.shape)
weight_gather_reshape = reshape(weight_gather, shape=out_shape) weight_gather_reshape = reshape(weight_gather, shape=out_shape)
out = paddle.cast(out, weight_gather_reshape.dtype) out = paddle.cast(out, weight_gather_reshape.dtype)
out = _legacy_C_ops.elementwise_mul(out, weight_gather_reshape)
else: else:
if input.shape[axis] != weight.shape[-1]: if input.shape[axis] != weight.shape[-1]:
raise ValueError( raise ValueError(
...@@ -2955,14 +2772,16 @@ def cross_entropy( ...@@ -2955,14 +2772,16 @@ def cross_entropy(
) )
) )
valid_label = paddle.multiply(
paddle.cast(label != ignore_index, dtype=label.dtype), label
)
ignore_weight_mask = paddle.cast( ignore_weight_mask = paddle.cast(
(label != ignore_index), out.dtype (label != ignore_index), input.dtype
) )
if ( if (
ignore_weight_mask.ndim > 1 ignore_weight_mask.ndim > 1
and ignore_weight_mask.shape[axis] == 1 and ignore_weight_mask.shape[axis] == 1
): ):
# TODO: Temporarily use squeeze instead of squeeze_
ignore_weight_mask = paddle.squeeze( ignore_weight_mask = paddle.squeeze(
ignore_weight_mask, axis ignore_weight_mask, axis
) )
...@@ -2976,193 +2795,54 @@ def cross_entropy( ...@@ -2976,193 +2795,54 @@ def cross_entropy(
) )
+ [axis % valid_label.ndim] + [axis % valid_label.ndim]
) )
weight_gather = _legacy_C_ops.gather_nd( weight_gather = paddle.gather_nd(
weight, valid_label.transpose(temp_perm) weight, paddle.transpose(valid_label, temp_perm)
) )
else: else:
weight_gather = _legacy_C_ops.gather_nd(weight, valid_label) weight_gather = paddle.gather_nd(weight, valid_label)
weight_gather = _legacy_C_ops.elementwise_mul( weight_gather = paddle.multiply(
weight_gather, ignore_weight_mask weight_gather, ignore_weight_mask
) )
input_shape = list(label.shape) input_shape = list(label.shape)
weight_gather_reshape = reshape( weight_gather_reshape = reshape(
weight_gather, shape=input_shape weight_gather, shape=input_shape
) )
out = paddle.cast(out, weight_gather_reshape.dtype) out = paddle.multiply(out, weight_gather_reshape, name=weight_name)
out = _legacy_C_ops.elementwise_mul(out, weight_gather_reshape)
if reduction == "sum": if reduction == "sum":
# because of fluid_softmax_with_cross_entropy op's inner logic, return paddle.sum(out, name=name)
# in the out tensor of this op, the loss of sample with class_index==ignore_index is 0
# so, reduce_sum all directly is ok
return _legacy_C_ops.reduce_sum(out, 'reduce_all', True)
elif reduction == "mean": elif reduction == "mean":
# 1. if weight==none, if ignore_index >= 0:
# numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic out_sum = paddle.sum(out, name=name)
# denominator: count sample num with class_index!=ignore_index
# 2. else
# numerator: loss's weighted sum
# denominator: cal the sum of weight where the sample's class_index!=ignore_index
is_ignore = label == ignore_index
mask = ~is_ignore
if paddle.count_nonzero(is_ignore) > 0: # ignore label
out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True)
# for each label[i],set 1 or 0, according to ignore_index # for each label[i],set 1 or 0, according to ignore_index
# mask[i]=0, if label[i]==ignore_index # mask[i]=0, if label[i]==ignore_index
# mask[i]=1, otherwise # mask[i]=1, otherwise
mask = label != ignore_index
if weight is None: if weight is None:
mask = paddle.cast(mask, dtype=out_sum.dtype) mask = paddle.cast(mask, dtype=out_sum.dtype)
count = _legacy_C_ops.reduce_sum(mask, 'reduce_all', True) count = paddle.sum(mask, name=name)
ret = out_sum / (count + (count == 0.0)) ret = out_sum / (count + (count == 0.0))
else: else:
mask = paddle.cast(mask, weight_gather_reshape.dtype) mask = paddle.cast(mask, weight_gather_reshape.dtype)
weight_ignored = _legacy_C_ops.elementwise_mul( weight_ignored = paddle.multiply(
mask, weight_gather_reshape mask, weight_gather_reshape
) )
weight_sum = _legacy_C_ops.reduce_sum( weight_sum = paddle.sum(weight_ignored, name=name)
weight_ignored, 'reduce_all', True
)
ret = out_sum / (weight_sum + (weight_sum == 0.0)) ret = out_sum / (weight_sum + (weight_sum == 0.0))
return ret return ret
elif weight is not None: elif weight is not None:
out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True) out_sum = paddle.sum(out, name=name)
total_weight = _legacy_C_ops.reduce_sum( total_weight = paddle.sum(weight_gather_reshape)
weight_gather_reshape, 'reduce_all', True
)
return out_sum / (total_weight + (total_weight == 0.0)) return out_sum / (total_weight + (total_weight == 0.0))
else: else:
return _legacy_C_ops.mean(out) return paddle.mean(out, name=name)
else: else:
if input_dims - 1 == label_dims: if input_dims - 1 == label_dims:
out = paddle.squeeze(out, axis=axis) out = paddle.squeeze(out, axis=axis)
return out
check_variable_and_dtype(
input,
'input',
['float16', 'float32', 'float64'],
'softmax_cross_entropy',
)
check_variable_and_dtype(
label,
'label',
['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'],
'softmax_cross_entropy',
)
attrs = {
'soft_label': soft_label,
'ignore_index': ignore_index,
'numeric_stable_mode': True,
'axis': axis,
'use_softmax': use_softmax,
}
helper = LayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_variable_for_type_inference(dtype=input.dtype)
out = helper.create_variable_for_type_inference(dtype=input.dtype)
outputs = {'Softmax': softmax, 'Loss': out}
if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
backprop = helper.create_variable_for_type_inference(dtype=input.dtype)
outputs['Backprop'] = backprop
helper.append_op(
type='softmax_with_cross_entropy',
inputs={'Logits': input, 'Label': label},
outputs=outputs,
attrs=attrs,
)
if weight is not None:
check_variable_and_dtype(
weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy'
)
weight_name = name if reduction == 'none' else None
if soft_label:
# chajchaj:
# trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
# weight's shape is C, where C is class num.
# for 1d case: label's shape is [N,C], weight_gather's shape is N.
# for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
weight_gather = paddle.matmul(
x=paddle.cast(label, weight.dtype),
y=weight,
transpose_x=False,
transpose_y=True,
)
out_shape = list(out.shape)
weight_gather_reshape = reshape(weight_gather, shape=out_shape)
out = paddle.cast(out, weight_gather_reshape.dtype)
else:
if input.shape[axis] != weight.shape[-1]:
raise ValueError(
"input's class_dimension({}) must equal to "
"weight's class_dimension({}) "
"when weight is provided".format(
input.shape[axis], weight.shape[-1]
)
)
valid_label = paddle.multiply(
paddle.cast(label != ignore_index, dtype=label.dtype), label
)
ignore_weight_mask = paddle.cast(
(label != ignore_index), input.dtype
)
if (
ignore_weight_mask.ndim > 1
and ignore_weight_mask.shape[axis] == 1
):
ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis)
if axis != -1 and axis != valid_label.ndim - 1:
temp_perm = (
list(range(axis % valid_label.ndim))
+ list(
range((axis % valid_label.ndim + 1), valid_label.ndim)
)
+ [axis % valid_label.ndim]
)
weight_gather = paddle.gather_nd(
weight, paddle.transpose(valid_label, temp_perm)
)
else:
weight_gather = paddle.gather_nd(weight, valid_label)
weight_gather = paddle.multiply(weight_gather, ignore_weight_mask)
input_shape = list(label.shape)
weight_gather_reshape = reshape(weight_gather, shape=input_shape)
out = paddle.multiply(out, weight_gather_reshape, name=weight_name)
if reduction == "sum":
return paddle.sum(out, name=name)
elif reduction == "mean":
if ignore_index >= 0:
out_sum = paddle.sum(out, name=name)
# for each label[i],set 1 or 0, according to ignore_index
# mask[i]=0, if label[i]==ignore_index
# mask[i]=1, otherwise
mask = label != ignore_index
if weight is None:
mask = paddle.cast(mask, dtype=out_sum.dtype)
count = paddle.sum(mask, name=name)
ret = out_sum / (count + (count == 0.0))
else:
mask = paddle.cast(mask, weight_gather_reshape.dtype)
weight_ignored = paddle.multiply(mask, weight_gather_reshape)
weight_sum = paddle.sum(weight_ignored, name=name)
ret = out_sum / (weight_sum + (weight_sum == 0.0))
return ret
elif weight is not None:
out_sum = paddle.sum(out, name=name)
total_weight = paddle.sum(weight_gather_reshape)
return out_sum / (total_weight + (total_weight == 0.0))
else:
return paddle.mean(out, name=name)
else:
if input_dims - 1 == label_dims:
out = paddle.squeeze(out, axis=axis)
return out return out
def sigmoid_focal_loss( def sigmoid_focal_loss(
...@@ -3306,92 +2986,40 @@ def sigmoid_focal_loss( ...@@ -3306,92 +2986,40 @@ def sigmoid_focal_loss(
return loss return loss
elif _in_legacy_dygraph(): else:
one = _varbase_creator(dtype=logit.dtype) check_variable_and_dtype(
_legacy_C_ops.fill_constant( logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss'
one,
'value',
float(1.0),
'force_cpu',
False,
'dtype',
one.dtype,
'str_value',
'1.0',
'shape',
logit.shape,
)
loss = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label)
pred = _legacy_C_ops.sigmoid(logit)
p_t = _legacy_C_ops.elementwise_add(
_legacy_C_ops.elementwise_mul(pred, label),
_legacy_C_ops.elementwise_mul(
_legacy_C_ops.elementwise_sub(one, pred),
_legacy_C_ops.elementwise_sub(one, label),
),
) )
check_variable_and_dtype(
alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss'
alpha_t = _legacy_C_ops.elementwise_add(
_legacy_C_ops.elementwise_mul(alpha, label),
_legacy_C_ops.elementwise_mul(
_legacy_C_ops.elementwise_sub(one, alpha),
_legacy_C_ops.elementwise_sub(one, label),
),
) )
loss = _legacy_C_ops.elementwise_mul(alpha_t, loss)
gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) bce_name = None
gamma_t = _legacy_C_ops.elementwise_pow( if reduction == 'none' and normalizer is None:
_legacy_C_ops.elementwise_sub(one, p_t), gamma bce_name = name
loss = paddle.nn.functional.binary_cross_entropy_with_logits(
logit, label, reduction='none', name=bce_name
) )
loss = _legacy_C_ops.elementwise_mul(gamma_t, loss)
if normalizer is not None:
loss = _legacy_C_ops.elementwise_div(loss, normalizer)
if reduction == "sum":
return _legacy_C_ops.reduce_sum(loss, 'reduce_all', True)
elif reduction == "mean":
return _legacy_C_ops.mean(loss)
return loss
check_variable_and_dtype(
logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss'
)
check_variable_and_dtype(
label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss'
)
bce_name = None
if reduction == 'none' and normalizer is None:
bce_name = name
loss = paddle.nn.functional.binary_cross_entropy_with_logits(
logit, label, reduction='none', name=bce_name
)
pred = paddle.nn.functional.sigmoid(logit) pred = paddle.nn.functional.sigmoid(logit)
p_t = pred * label + (1 - pred) * (1 - label) p_t = pred * label + (1 - pred) * (1 - label)
alpha_t = alpha * label + (1 - alpha) * (1 - label) alpha_t = alpha * label + (1 - alpha) * (1 - label)
loss = paddle.multiply(alpha_t, loss) loss = paddle.multiply(alpha_t, loss)
gamma_t = paddle.pow((1 - p_t), gamma) gamma_t = paddle.pow((1 - p_t), gamma)
loss = paddle.multiply(gamma_t, loss) loss = paddle.multiply(gamma_t, loss)
if normalizer is not None: if normalizer is not None:
normalizer_name = name if reduction == 'none' else None normalizer_name = name if reduction == 'none' else None
loss = paddle.divide(loss, normalizer, name=normalizer_name) loss = paddle.divide(loss, normalizer, name=normalizer_name)
if reduction == 'mean': if reduction == 'mean':
loss = paddle.mean(loss, name=name) loss = paddle.mean(loss, name=name)
elif reduction == 'sum': elif reduction == 'sum':
loss = paddle.sum(loss, name=name) loss = paddle.sum(loss, name=name)
return loss return loss
def multi_label_soft_margin_loss( def multi_label_soft_margin_loss(
@@ -3463,7 +3091,7 @@ def multi_label_soft_margin_loss(
             "but received {}!={}".format(input.shape, label.shape)
         )
 
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         check_variable_and_dtype(
             input,
             'input',
@@ -3483,7 +3111,7 @@ def multi_label_soft_margin_loss(
         )
 
     if weight is not None:
-        if not _non_static_mode():
+        if not in_dygraph_mode():
             check_variable_and_dtype(
                 weight,
                 'weight',
@@ -3582,7 +3210,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
             "but received {}.".format(reduction)
         )
 
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         check_variable_and_dtype(
             input, 'input', ['float32', 'float64'], 'hinge_embedding_loss'
         )
@@ -3807,7 +3435,7 @@ def triplet_margin_with_distance_loss(
         raise ValueError(
             "The margin between positive samples and negative samples should be greater than 0."
         )
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         check_variable_and_dtype(
             input,
             'input',
@@ -3956,7 +3584,7 @@ def triplet_margin_loss(
         raise ValueError(
             "The margin between positive samples and negative samples should be greater than 0."
         )
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         check_variable_and_dtype(
             input, 'input', ['float32', 'float64'], 'triplet_margin_loss'
         )
@@ -4066,7 +3694,7 @@ def multi_margin_loss(
             "but received {}.".format(reduction)
         )
 
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         check_variable_and_dtype(
             input, 'input', ['float32', 'float64'], 'multi_margin_loss'
         )
@@ -4083,7 +3711,7 @@ def multi_margin_loss(
     label = label.reshape((-1, 1))
     index_sample = paddle.index_sample(input, label)
     if weight is not None:
-        if not _non_static_mode():
+        if not in_dygraph_mode():
             check_variable_and_dtype(
                 weight, 'weight', ['float32', 'float64'], 'multi_margin_loss'
             )
@@ -4187,7 +3815,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
             % reduction
         )
 
-    if not _non_static_mode():
+    if not in_dygraph_mode():
         fluid.data_feeder.check_variable_and_dtype(
             input, 'input', ['float32', 'float64'], 'soft_margin_loss'
         )
...
@@ -83,47 +83,33 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
         out = _C_ops.p_norm(x, float(p), axis, epsilon, True, False)
         return x / _C_ops.maximum(out, eps)
-    if _in_legacy_dygraph():
-        eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
-        out = _legacy_C_ops.p_norm(
-            x,
-            'axis',
-            axis,
-            'porder',
-            float(p),
-            'keepdim',
-            True,
-            'epsilon',
-            epsilon,
-        )
-        return x / _legacy_C_ops.elementwise_max(out, eps)
-
-    check_type(p, 'p', (float, int), 'normalize')
-    check_type(axis, 'axis', (int), 'normalize')
-    check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64'], 'normalize'
-    )
-    if len(x.shape) == 1 and axis != 0 and axis != -1:
-        raise ValueError(
-            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".format(
-                axis
-            )
-        )
-
-    attrs = {
-        'axis': axis,
-        'porder': float(p),
-        'keepdim': True,
-        'epsilon': epsilon,
-    }
-    helper = LayerHelper('p_norm', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs
-    )
-    eps = out.block.create_var(dtype=out.dtype)
-    eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype)
-    return paddle.divide(x, paddle.maximum(out, eps), name=name)
+    else:
+        check_type(p, 'p', (float, int), 'normalize')
+        check_type(axis, 'axis', (int), 'normalize')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'normalize'
+        )
+        if len(x.shape) == 1 and axis != 0 and axis != -1:
+            raise ValueError(
+                "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".format(
+                    axis
+                )
+            )
+
+        attrs = {
+            'axis': axis,
+            'porder': float(p),
+            'keepdim': True,
+            'epsilon': epsilon,
+        }
+        helper = LayerHelper('p_norm', **locals())
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        helper.append_op(
+            type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs
+        )
+        eps = out.block.create_var(dtype=out.dtype)
+        eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype)
+        return paddle.divide(x, paddle.maximum(out, eps), name=name)
 
 
 def batch_norm(
...@@ -229,98 +215,62 @@ def batch_norm( ...@@ -229,98 +215,62 @@ def batch_norm(
batch_norm_out, act=None batch_norm_out, act=None
) )
elif _in_legacy_dygraph(): else:
# for dygraph need tuple check_variable_and_dtype(
attrs = ( x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
"momentum",
momentum,
"epsilon",
epsilon,
"is_test",
not training,
"data_layout",
data_format,
"use_mkldnn",
False,
"fuse_with_relu",
False,
"use_global_stats",
use_global_stats,
"trainable_statistics",
trainable_statistics,
) )
batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( # for static need dict
x, attrs = {
weight, "momentum": momentum,
bias, "epsilon": epsilon,
running_mean, "is_test": not training,
running_var, "data_layout": data_format,
None, "use_mkldnn": False,
mean_out, "fuse_with_relu": False,
variance_out, "use_global_stats": use_global_stats,
*attrs "trainable_statistics": trainable_statistics,
}
inputs = {
"X": [x],
"Scale": [weight],
"Bias": [bias],
"Mean": [running_mean],
"Variance": [running_var],
}
helper = LayerHelper('batch_norm', **locals())
param_dtype = x.dtype if x.dtype != 'float16' else 'float32'
saved_mean = helper.create_variable_for_type_inference(
dtype=param_dtype, stop_gradient=True
) )
saved_variance = helper.create_variable_for_type_inference(
return dygraph_utils._append_activation_in_dygraph( dtype=param_dtype, stop_gradient=True
batch_norm_out, act=None
) )
batch_norm_out = helper.create_variable_for_type_inference(x.dtype)
outputs = {
"Y": [batch_norm_out],
"MeanOut": [running_mean],
"VarianceOut": [running_var],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance],
}
if training or trainable_statistics:
# reserve_space is only used for training.
reserve_space = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
outputs["ReserveSpace"] = [reserve_space]
check_variable_and_dtype( helper.append_op(
x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
)
# for static need dict
attrs = {
"momentum": momentum,
"epsilon": epsilon,
"is_test": not training,
"data_layout": data_format,
"use_mkldnn": False,
"fuse_with_relu": False,
"use_global_stats": use_global_stats,
"trainable_statistics": trainable_statistics,
}
inputs = {
"X": [x],
"Scale": [weight],
"Bias": [bias],
"Mean": [running_mean],
"Variance": [running_var],
}
helper = LayerHelper('batch_norm', **locals())
param_dtype = x.dtype if x.dtype != 'float16' else 'float32'
saved_mean = helper.create_variable_for_type_inference(
dtype=param_dtype, stop_gradient=True
)
saved_variance = helper.create_variable_for_type_inference(
dtype=param_dtype, stop_gradient=True
)
batch_norm_out = helper.create_variable_for_type_inference(x.dtype)
outputs = {
"Y": [batch_norm_out],
"MeanOut": [running_mean],
"VarianceOut": [running_var],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance],
}
if training or trainable_statistics:
# reserve_space is only used for training.
reserve_space = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
) )
outputs["ReserveSpace"] = [reserve_space]
helper.append_op( return helper.append_activation(batch_norm_out)
type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
)
return helper.append_activation(batch_norm_out)
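With the legacy branch gone, F.batch_norm keeps only the dygraph C++ op path and the static-graph LayerHelper path. A sketch exercising both follows; it is illustrative only, with assumed shapes and plain constant statistics:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    # dygraph path (default mode)
    x = paddle.rand([2, 3, 4, 4], dtype='float32')
    running_mean = paddle.zeros([3])
    running_var = paddle.ones([3])
    weight = paddle.ones([3])
    bias = paddle.zeros([3])
    y = F.batch_norm(x, running_mean, running_var, weight, bias, training=False)

    # static-graph path (the LayerHelper branch kept on the right-hand side)
    paddle.enable_static()
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        xs = paddle.static.data('x', [2, 3, 4, 4], 'float32')
        rm = paddle.static.create_global_var([3], 0.0, 'float32', persistable=True)
        rv = paddle.static.create_global_var([3], 1.0, 'float32', persistable=True)
        w = paddle.static.create_parameter([3], 'float32')
        b = paddle.static.create_parameter([3], 'float32')
        out = F.batch_norm(xs, rm, rv, w, b, training=False)
    exe = paddle.static.Executor()
    exe.run(startup)
    (res,) = exe.run(
        main,
        feed={'x': np.random.rand(2, 3, 4, 4).astype('float32')},
        fetch_list=[out],
    )
    paddle.disable_static()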
def layer_norm( def layer_norm(
...@@ -483,48 +433,41 @@ def instance_norm( ...@@ -483,48 +433,41 @@ def instance_norm(
if in_dygraph_mode(): if in_dygraph_mode():
out = _C_ops.instance_norm(x, weight, bias, eps) out = _C_ops.instance_norm(x, weight, bias, eps)
return out return out
if _in_legacy_dygraph(): else:
out, _, _ = _legacy_C_ops.instance_norm( check_variable_and_dtype(
x, x, 'input', ['float32', 'float64'], "InstanceNorm"
weight,
bias,
"epsilon",
eps,
"momentum",
momentum,
"data_format",
data_format,
) )
return out
check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") attrs = {
"epsilon": eps,
"momentum": momentum,
"data_format": data_format,
}
attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format} if weight and bias:
inputs = {"X": [x], "Scale": [weight], "Bias": [bias]}
else:
inputs = {"X": [x]}
if weight and bias: helper = LayerHelper('instance_norm', **locals())
inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} saved_mean = helper.create_variable_for_type_inference(
else: dtype=x.dtype, stop_gradient=True
inputs = {"X": [x]} )
saved_variance = helper.create_variable_for_type_inference(
helper = LayerHelper('instance_norm', **locals()) dtype=x.dtype, stop_gradient=True
saved_mean = helper.create_variable_for_type_inference( )
dtype=x.dtype, stop_gradient=True instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
)
saved_variance = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
outputs = { outputs = {
"Y": [instance_norm_out], "Y": [instance_norm_out],
"SavedMean": [saved_mean], "SavedMean": [saved_mean],
"SavedVariance": [saved_variance], "SavedVariance": [saved_variance],
} }
helper.append_op( helper.append_op(
type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs
) )
return instance_norm_out return instance_norm_out
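A small dygraph sketch of F.instance_norm with weight and bias omitted; the static branch above only wires Scale/Bias inputs when both are given (illustrative shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 3, 8, 8], dtype='float32')
    # weight/bias are optional; without them the op runs without Scale/Bias inputs
    y = F.instance_norm(x, eps=1e-5)
    print(y.shape)  # [2, 3, 8, 8]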
def local_response_norm( def local_response_norm(
......
...@@ -13,12 +13,7 @@ ...@@ -13,12 +13,7 @@
# limitations under the License. # limitations under the License.
from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode
from paddle.fluid.framework import ( from paddle.fluid.framework import Variable, in_dygraph_mode
Variable,
_in_legacy_dygraph,
_non_static_mode,
in_dygraph_mode,
)
from ...fluid.data_feeder import check_type, check_variable_and_dtype from ...fluid.data_feeder import check_type, check_variable_and_dtype
...@@ -266,59 +261,32 @@ def avg_pool1d( ...@@ -266,59 +261,32 @@ def avg_pool1d(
) )
return squeeze(output, [2]) return squeeze(output, [2])
if _in_legacy_dygraph(): else:
output = _legacy_C_ops.pool2d( op_type = 'pool2d'
x, helper = LayerHelper(op_type, **locals())
'pooling_type', dtype = helper.input_dtype(input_param_name='x')
'avg', pool_out = helper.create_variable_for_type_inference(dtype)
'ksize',
kernel_size,
'global_pooling',
False,
'strides',
stride,
'paddings',
padding,
'padding_algorithm',
padding_algorithm,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
exclusive,
'data_format',
data_format,
)
return squeeze(output, [2])
op_type = 'pool2d'
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=op_type, type=op_type,
inputs={"X": x}, inputs={"X": x},
outputs={"Out": pool_out}, outputs={"Out": pool_out},
attrs={ attrs={
"pooling_type": 'avg', "pooling_type": 'avg',
"ksize": kernel_size, "ksize": kernel_size,
"global_pooling": False, "global_pooling": False,
"strides": stride, "strides": stride,
"paddings": padding, "paddings": padding,
"padding_algorithm": padding_algorithm, "padding_algorithm": padding_algorithm,
"use_cudnn": True, "use_cudnn": True,
"ceil_mode": ceil_mode, "ceil_mode": ceil_mode,
"use_mkldnn": False, "use_mkldnn": False,
"exclusive": exclusive, "exclusive": exclusive,
"data_format": data_format, "data_format": data_format,
}, },
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
def avg_pool2d( def avg_pool2d(
...@@ -397,83 +365,58 @@ def avg_pool2d( ...@@ -397,83 +365,58 @@ def avg_pool2d(
padding, 2, channel_last, ceil_mode=ceil_mode padding, 2, channel_last, ceil_mode=ceil_mode
) )
if _non_static_mode(): if in_dygraph_mode():
if in_dygraph_mode(): output = _C_ops.pool2d(
output = _C_ops.pool2d( x,
x, kernel_size,
kernel_size, stride,
stride, padding,
padding, ceil_mode,
ceil_mode, exclusive,
exclusive, data_format,
data_format, 'avg',
'avg', False,
False, False,
False, padding_algorithm,
padding_algorithm, )
)
else:
output = _legacy_C_ops.pool2d(
x,
'pooling_type',
'avg',
'ksize',
kernel_size,
'global_pooling',
False,
'padding_algorithm',
padding_algorithm,
'strides',
stride,
'paddings',
padding,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
exclusive,
'data_format',
data_format,
)
if divisor_override is None: if divisor_override is None:
return output return output
else: else:
_check_instance(divisor_override, "divisor_override") _check_instance(divisor_override, "divisor_override")
return output * (kernel_size[0] * kernel_size[1]) / divisor_override return output * (kernel_size[0] * kernel_size[1]) / divisor_override
else:
op_type = 'pool2d'
helper = LayerHelper(op_type, **locals())
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
op_type = 'pool2d' helper.append_op(
helper = LayerHelper(op_type, **locals()) type=op_type,
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') inputs={"X": x},
dtype = helper.input_dtype(input_param_name='x') outputs={"Out": pool_out},
pool_out = helper.create_variable_for_type_inference(dtype) attrs={
"pooling_type": "avg",
helper.append_op( "ksize": kernel_size,
type=op_type, "global_pooling": False,
inputs={"X": x}, "strides": stride,
outputs={"Out": pool_out}, "paddings": padding,
attrs={ "padding_algorithm": padding_algorithm,
"pooling_type": "avg", "use_cudnn": True,
"ksize": kernel_size, "ceil_mode": ceil_mode,
"global_pooling": False, "use_mkldnn": False,
"strides": stride, "exclusive": exclusive,
"paddings": padding, "data_format": data_format,
"padding_algorithm": padding_algorithm, },
"use_cudnn": True, )
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": exclusive,
"data_format": data_format,
},
)
if divisor_override is None: if divisor_override is None:
return pool_out return pool_out
else: else:
_check_instance(divisor_override, "divisor_override") _check_instance(divisor_override, "divisor_override")
return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override return (
pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
)
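An illustrative dygraph call of F.avg_pool2d showing the divisor_override rescaling kept in the hunk above (made-up shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 1, 4, 4], dtype='float32')
    y = F.avg_pool2d(x, kernel_size=2, stride=2)

    # the output is rescaled by kernel_size[0] * kernel_size[1] / divisor_override,
    # so with a 2x2 kernel and divisor_override=8 each averaged value is halved
    y_override = F.avg_pool2d(x, kernel_size=2, stride=2, divisor_override=8)
    print(y.shape, y_override.shape)  # both [1, 1, 2, 2]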
def avg_pool3d( def avg_pool3d(
...@@ -565,32 +508,6 @@ def avg_pool3d( ...@@ -565,32 +508,6 @@ def avg_pool3d(
False, False,
padding_algorithm, padding_algorithm,
) )
elif _in_legacy_dygraph():
pool_out = _legacy_C_ops.pool3d(
x,
'pooling_type',
'avg',
'ksize',
kernel_size,
'strides',
stride,
'paddings',
padding,
'global_pooling',
False,
'padding_algorithm',
padding_algorithm,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
exclusive,
'data_format',
data_format,
)
else: else:
op_type = "pool3d" op_type = "pool3d"
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
...@@ -723,95 +640,38 @@ def max_pool1d( ...@@ -723,95 +640,38 @@ def max_pool1d(
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
if _in_legacy_dygraph(): else:
if return_mask: op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
pool_out = _legacy_C_ops.max_pool2d_with_index( helper = LayerHelper(op_type, **locals())
x, dtype = helper.input_dtype(input_param_name='x')
'ksize', pool_out = helper.create_variable_for_type_inference(dtype)
kernel_size, mask = helper.create_variable_for_type_inference('int32')
'global_pooling', outputs = {"Out": pool_out, "Mask": mask}
False,
'strides',
stride,
'paddings',
padding,
'padding_algorithm',
padding_algorithm,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
True,
'data_format',
data_format,
)
return (
(squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
if return_mask
else squeeze(pool_out[0], [2])
)
else:
pool_out = _legacy_C_ops.pool2d(
x,
'pooling_type',
'max',
'ksize',
kernel_size,
'global_pooling',
False,
'padding_algorithm',
padding_algorithm,
'strides',
stride,
'paddings',
padding,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
True,
'data_format',
data_format,
)
return squeeze(pool_out, [2])
op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference('int32')
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op( helper.append_op(
type=op_type, type=op_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": 'max', "pooling_type": 'max',
"ksize": kernel_size, "ksize": kernel_size,
"global_pooling": False, "global_pooling": False,
"strides": stride, "strides": stride,
"paddings": padding, "paddings": padding,
"padding_algorithm": padding_algorithm, "padding_algorithm": padding_algorithm,
"use_cudnn": True, "use_cudnn": True,
"ceil_mode": ceil_mode, "ceil_mode": ceil_mode,
"use_mkldnn": False, "use_mkldnn": False,
"exclusive": True, "exclusive": True,
"data_format": data_format, "data_format": data_format,
}, },
) )
return ( return (
(squeeze(pool_out, [2]), squeeze(mask, [2])) (squeeze(pool_out, [2]), squeeze(mask, [2]))
if return_mask if return_mask
else squeeze(pool_out, [2]) else squeeze(pool_out, [2])
) )
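A dygraph sketch of F.max_pool1d with return_mask=True; as in the branch above, the op runs on an unsqueezed 4-D tensor and both outputs are squeezed back to 3-D (illustrative shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 3, 16], dtype='float32')  # NCL layout
    out, mask = F.max_pool1d(x, kernel_size=2, stride=2, return_mask=True)
    print(out.shape, mask.shape)  # [1, 3, 8] and [1, 3, 8]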
def _unpool_output_size(x, kernel_size, stride, padding, output_size): def _unpool_output_size(x, kernel_size, stride, padding, output_size):
...@@ -831,7 +691,7 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size): ...@@ -831,7 +691,7 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
if output_size is None: if output_size is None:
return default_size return default_size
elif utils._contain_var(output_size): elif utils._contain_var(output_size):
if not _non_static_mode(): if not in_dygraph_mode():
has_static_var = True has_static_var = True
output_size = utils._convert_to_tensor_list(output_size) output_size = utils._convert_to_tensor_list(output_size)
else: else:
...@@ -1366,114 +1226,61 @@ def max_pool2d( ...@@ -1366,114 +1226,61 @@ def max_pool2d(
padding_algorithm, padding_algorithm,
) )
if _in_legacy_dygraph(): else:
op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
helper = LayerHelper(op_type, **locals())
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64'], 'max_pool2d'
)
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
if return_mask: if return_mask:
output = _legacy_C_ops.max_pool2d_with_index( mask = helper.create_variable_for_type_inference("int32")
x, outputs = {"Out": pool_out, "Mask": mask}
'ksize',
kernel_size, helper.append_op(
'global_pooling', type="max_pool2d_with_index",
False, inputs={"X": x},
'strides', outputs=outputs,
stride, attrs={
'paddings', "pooling_type": 'max',
padding, "ksize": kernel_size,
'padding_algorithm', "global_pooling": False,
padding_algorithm, "strides": stride,
'use_cudnn', "paddings": padding,
True, "padding_algorithm": padding_algorithm,
'ceil_mode', "use_cudnn": True,
ceil_mode, "ceil_mode": ceil_mode,
'use_mkldnn', "use_mkldnn": False,
False, "exclusive": True,
'exclusive', "data_format": data_format,
True, },
'data_format',
data_format,
) )
return output if return_mask else output[0] return (pool_out, mask)
else: else:
output = _legacy_C_ops.pool2d( outputs = {"Out": pool_out}
x,
'pooling_type', helper.append_op(
'max', type="pool2d",
'ksize', inputs={"X": x},
kernel_size, outputs=outputs,
'global_pooling', attrs={
False, "pooling_type": 'max',
'padding_algorithm', "ksize": kernel_size,
padding_algorithm, "global_pooling": False,
'strides', "strides": stride,
stride, "paddings": padding,
'paddings', "padding_algorithm": padding_algorithm,
padding, "use_cudnn": True,
'use_cudnn', "ceil_mode": ceil_mode,
True, "use_mkldnn": False,
'ceil_mode', "exclusive": True,
ceil_mode, "data_format": data_format,
'use_mkldnn', },
False,
'exclusive',
True,
'data_format',
data_format,
) )
return output return pool_out
op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
helper = LayerHelper(op_type, **locals())
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64'], 'max_pool2d'
)
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
if return_mask:
mask = helper.create_variable_for_type_inference("int32")
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type="max_pool2d_with_index",
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": 'max',
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
"paddings": padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": True,
"data_format": data_format,
},
)
return (pool_out, mask)
else:
outputs = {"Out": pool_out}
helper.append_op(
type="pool2d",
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": 'max',
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
"paddings": padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": True,
"data_format": data_format,
},
)
return pool_out
def max_pool3d( def max_pool3d(
...@@ -1580,90 +1387,35 @@ def max_pool3d( ...@@ -1580,90 +1387,35 @@ def max_pool3d(
padding_algorithm, padding_algorithm,
) )
if _in_legacy_dygraph(): else:
if return_mask: op_type = "max_pool3d_with_index" if return_mask else "pool3d"
output = _legacy_C_ops.max_pool3d_with_index( helper = LayerHelper(op_type, **locals())
x, check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
'pooling_type', dtype = helper.input_dtype(input_param_name='x')
'max', pool_out = helper.create_variable_for_type_inference(dtype)
'ksize', mask = helper.create_variable_for_type_inference('int32')
kernel_size, outputs = {"Out": pool_out, "Mask": mask}
'strides',
stride,
'paddings',
padding,
'global_pooling',
False,
'padding_algorithm',
padding_algorithm,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
True,
'data_format',
data_format,
)
return output if return_mask else output[0]
else:
output = _legacy_C_ops.pool3d(
x,
'pooling_type',
'max',
'ksize',
kernel_size,
'global_pooling',
False,
'padding_algorithm',
padding_algorithm,
'strides',
stride,
'paddings',
padding,
'use_cudnn',
True,
'ceil_mode',
ceil_mode,
'use_mkldnn',
False,
'exclusive',
True,
'data_format',
data_format,
)
return output
op_type = "max_pool3d_with_index" if return_mask else "pool3d"
helper = LayerHelper(op_type, **locals())
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference('int32')
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op( helper.append_op(
type=op_type, type=op_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": 'max', "pooling_type": 'max',
"ksize": kernel_size, "ksize": kernel_size,
"global_pooling": False, "global_pooling": False,
"strides": stride, "strides": stride,
"paddings": padding, "paddings": padding,
"padding_algorithm": padding_algorithm, "padding_algorithm": padding_algorithm,
"use_cudnn": True, "use_cudnn": True,
"ceil_mode": ceil_mode, "ceil_mode": ceil_mode,
"use_mkldnn": False, "use_mkldnn": False,
"exclusive": False, "exclusive": False,
"data_format": data_format, "data_format": data_format,
}, },
) )
return (pool_out, mask) if return_mask else pool_out return (pool_out, mask) if return_mask else pool_out
def adaptive_avg_pool1d(x, output_size, name=None): def adaptive_avg_pool1d(x, output_size, name=None):
...@@ -1729,31 +1481,26 @@ def adaptive_avg_pool1d(x, output_size, name=None): ...@@ -1729,31 +1481,26 @@ def adaptive_avg_pool1d(x, output_size, name=None):
"EXPLICIT", "EXPLICIT",
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
if _in_legacy_dygraph(): else:
pool_out = _legacy_C_ops.pool2d( l_type = "pool2d"
x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True
)
return squeeze(pool_out, [2])
l_type = "pool2d"
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out} outputs = {"Out": pool_out}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": pool_type, "pooling_type": pool_type,
"ksize": pool_size, "ksize": pool_size,
"adaptive": True, "adaptive": True,
}, },
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
...@@ -1841,7 +1588,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1841,7 +1588,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
if output_size[1] is None: if output_size[1] is None:
output_size[1] = in_w output_size[1] = in_w
if _non_static_mode(): if in_dygraph_mode():
output_size = [ output_size = [
item.numpy().item(0) if isinstance(item, Variable) else item item.numpy().item(0) if isinstance(item, Variable) else item
for item in output_size for item in output_size
...@@ -1866,42 +1613,28 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1866,42 +1613,28 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
"EXPLICIT", "EXPLICIT",
) )
if _in_legacy_dygraph(): else:
return _legacy_C_ops.pool2d( l_type = 'pool2d'
x,
'pooling_type',
'avg',
'ksize',
output_size,
'global_pooling',
False,
'adaptive',
True,
'data_format',
data_format,
)
l_type = 'pool2d'
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out} outputs = {"Out": pool_out}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": "avg", "pooling_type": "avg",
"ksize": output_size, "ksize": output_size,
"adaptive": True, "adaptive": True,
"data_format": data_format, "data_format": data_format,
}, },
) )
return pool_out return pool_out
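An illustrative call of F.adaptive_avg_pool2d with a None entry in output_size, the case resolved just before the branch above (made-up shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 3, 32, 32], dtype='float32')
    # a None entry keeps the corresponding input size, so only the height is pooled
    y = F.adaptive_avg_pool2d(x, output_size=[4, None])
    print(y.shape)  # [2, 3, 4, 32]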
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
...@@ -2010,41 +1743,27 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -2010,41 +1743,27 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
True, True,
"EXPLICIT", "EXPLICIT",
) )
elif _in_legacy_dygraph(): else:
return _legacy_C_ops.pool3d( l_type = 'pool3d'
x,
'pooling_type',
'avg',
'ksize',
output_size,
'global_pooling',
False,
'adaptive',
True,
'data_format',
data_format,
)
l_type = 'pool3d'
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out} outputs = {"Out": pool_out}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": "avg", "pooling_type": "avg",
"ksize": output_size, "ksize": output_size,
"adaptive": True, "adaptive": True,
"data_format": data_format, "data_format": data_format,
}, },
) )
return pool_out return pool_out
def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
...@@ -2112,41 +1831,32 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): ...@@ -2112,41 +1831,32 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
if return_mask if return_mask
else squeeze(pool_out[0], [2]) else squeeze(pool_out[0], [2])
) )
if _in_legacy_dygraph(): else:
pool_out = _legacy_C_ops.max_pool2d_with_index( l_type = 'max_pool2d_with_index'
x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True
)
return (
(squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
if return_mask
else squeeze(pool_out[0], [2])
)
l_type = 'max_pool2d_with_index'
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference('int32') mask = helper.create_variable_for_type_inference('int32')
outputs = {"Out": pool_out, "Mask": mask} outputs = {"Out": pool_out, "Mask": mask}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": pool_type, "pooling_type": pool_type,
"ksize": pool_size, "ksize": pool_size,
"adaptive": True, "adaptive": True,
}, },
) )
return ( return (
(squeeze(pool_out, [2]), squeeze(mask, [2])) (squeeze(pool_out, [2]), squeeze(mask, [2]))
if return_mask if return_mask
else squeeze(pool_out, [2]) else squeeze(pool_out, [2])
) )
def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
...@@ -2211,33 +1921,28 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): ...@@ -2211,33 +1921,28 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
x, output_size, [1, 1], [0, 0], False, True x, output_size, [1, 1], [0, 0], False, True
) )
return pool_out if return_mask else pool_out[0] return pool_out if return_mask else pool_out[0]
if _in_legacy_dygraph(): else:
pool_out = _legacy_C_ops.max_pool2d_with_index( l_type = 'max_pool2d_with_index'
x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True
)
return pool_out if return_mask else pool_out[0]
l_type = 'max_pool2d_with_index'
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference('int32') mask = helper.create_variable_for_type_inference('int32')
outputs = {"Out": pool_out, "Mask": mask} outputs = {"Out": pool_out, "Mask": mask}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": 'max', "pooling_type": 'max',
"ksize": output_size, "ksize": output_size,
"adaptive": True, "adaptive": True,
}, },
) )
# return (pool_out, mask) if return_mask else pool_out # return (pool_out, mask) if return_mask else pool_out
return pool_out return pool_out
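A dygraph sketch of F.adaptive_max_pool2d with return_mask=True; note that, as the commented line above shows, the static-graph branch currently returns only pool_out (illustrative shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 3, 8, 8], dtype='float32')
    out, mask = F.adaptive_max_pool2d(x, output_size=3, return_mask=True)
    print(out.shape, mask.shape)  # [2, 3, 3, 3] for both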
def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
...@@ -2304,36 +2009,31 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): ...@@ -2304,36 +2009,31 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
if output_size[2] is None: if output_size[2] is None:
output_size[2] = in_w output_size[2] = in_w
if in_dynamic_mode(): if in_dygraph_mode():
if in_dygraph_mode(): # By default, strides is [1,1,1] and paddings is [0, 0, 0]
# By default, strides is [1,1,1] and paddings is [0, 0, 0] pool_out = _C_ops.max_pool3d_with_index(
pool_out = _C_ops.max_pool3d_with_index( x, output_size, [1, 1, 1], [0, 0, 0], False, True
x, output_size, [1, 1, 1], [0, 0, 0], False, True )
)
elif _in_legacy_dygraph():
pool_out = _legacy_C_ops.max_pool3d_with_index(
x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True
)
return pool_out if return_mask else pool_out[0] return pool_out if return_mask else pool_out[0]
else:
l_type = 'max_pool3d_with_index'
l_type = 'max_pool3d_with_index' helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
helper = LayerHelper(l_type, **locals()) pool_out = helper.create_variable_for_type_inference(dtype)
dtype = helper.input_dtype(input_param_name='x')
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference('int32') mask = helper.create_variable_for_type_inference('int32')
outputs = {"Out": pool_out, "Mask": mask} outputs = {"Out": pool_out, "Mask": mask}
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={"X": x}, inputs={"X": x},
outputs=outputs, outputs=outputs,
attrs={ attrs={
"pooling_type": 'max', "pooling_type": 'max',
"ksize": output_size, "ksize": output_size,
"adaptive": True, "adaptive": True,
}, },
) )
return (pool_out, mask) if return_mask else pool_out return (pool_out, mask) if return_mask else pool_out
...@@ -13,8 +13,7 @@ ...@@ -13,8 +13,7 @@
# limitations under the License. # limitations under the License.
from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.framework import in_dygraph_mode
from paddle.framework import _non_static_mode
from ...device import get_cudnn_version, is_compiled_with_rocm from ...device import get_cudnn_version, is_compiled_with_rocm
from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.data_feeder import check_variable_and_dtype
...@@ -381,22 +380,22 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): ...@@ -381,22 +380,22 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
) )
if in_dygraph_mode(): if in_dygraph_mode():
return _C_ops.pixel_shuffle(x, upscale_factor, data_format) return _C_ops.pixel_shuffle(x, upscale_factor, data_format)
else:
if _in_legacy_dygraph(): helper = LayerHelper("pixel_shuffle", **locals())
return _legacy_C_ops.pixel_shuffle( check_variable_and_dtype(
x, "upscale_factor", upscale_factor, "data_format", data_format x, 'x', ['float32', 'float64'], 'pixel_shuffle'
) )
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper = LayerHelper("pixel_shuffle", **locals()) helper.append_op(
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_shuffle') type="pixel_shuffle",
out = helper.create_variable_for_type_inference(dtype=x.dtype) inputs={"X": x},
helper.append_op( outputs={"Out": out},
type="pixel_shuffle", attrs={
inputs={"X": x}, "upscale_factor": upscale_factor,
outputs={"Out": out}, "data_format": data_format,
attrs={"upscale_factor": upscale_factor, "data_format": data_format}, },
) )
return out return out
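An illustrative dygraph call of F.pixel_shuffle (made-up shapes):

    import paddle
    import paddle.nn.functional as F

    # rearranges [N, C*r*r, H, W] into [N, C, H*r, W*r]
    x = paddle.rand([2, 9, 4, 4], dtype='float32')
    y = F.pixel_shuffle(x, upscale_factor=3)
    print(y.shape)  # [2, 1, 12, 12]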
def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
...@@ -442,7 +441,7 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): ...@@ -442,7 +441,7 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
"But recevie Attr(data_format): {} ".format(data_format) "But recevie Attr(data_format): {} ".format(data_format)
) )
if _non_static_mode(): if in_dygraph_mode():
return _legacy_C_ops.pixel_unshuffle( return _legacy_C_ops.pixel_unshuffle(
x, "downscale_factor", downscale_factor, "data_format", data_format x, "downscale_factor", downscale_factor, "data_format", data_format
) )
...@@ -516,7 +515,7 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None): ...@@ -516,7 +515,7 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None):
"But recevie Attr(data_format): {} ".format(data_format) "But recevie Attr(data_format): {} ".format(data_format)
) )
if _non_static_mode(): if in_dygraph_mode():
return _legacy_C_ops.channel_shuffle( return _legacy_C_ops.channel_shuffle(
x, "groups", groups, "data_format", data_format x, "groups", groups, "data_format", data_format
) )
......
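For completeness, illustrative dygraph calls of the two remaining vision ops touched above (made-up shapes):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 4, 8, 8], dtype='float32')

    # pixel_unshuffle is the inverse of pixel_shuffle: [N, C, H, W] -> [N, C*r*r, H/r, W/r]
    y = F.pixel_unshuffle(x, downscale_factor=2)
    print(y.shape)  # [2, 16, 4, 4]

    # channel_shuffle interleaves channels across `groups` blocks, keeping the shape
    z = F.channel_shuffle(x, groups=2)
    print(z.shape)  # [2, 4, 8, 8]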