[Eager] use final op in maskrcnn and hrnet (#41927)

* update * add conv yaml * add backward * remove useless code * fix bug * fix bug * revert fluid dygraph conv2d * remove useless infermeta function * fix meta fn deluplicat error * conv using custom impl * remove amp include * fix bug * use final op in maskrcnn and hrnet * refine Co-authored-by: N phlrain <phliuhongyu@126.com>

[Eager] use final op in maskrcnn and hrnet (#41927)
* update * add conv yaml * add backward * remove useless code * fix bug * fix bug * revert fluid dygraph conv2d * remove useless infermeta function * fix meta fn deluplicat error * conv using custom impl * remove amp include * fix bug * use final op in maskrcnn and hrnet * refine Co-authored-by: N phlrain <phliuhongyu@126.com>
aaabb796 · wanghuancoder · GitHub · 9291358d · aaabb796 · aaabb796
15 changed file
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -41,6 +41,7 @@ limitations under the License. */
 #include "paddle/phi/core/sparse_csr_tensor.h"
 #include "pybind11/detail/internals.h"
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/pybind/tensor_py.h"
@@ -713,15 +714,19 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
        break;
      }
    }
+    std::vector<int64_t> slice_axes_tmp(slice_axes.begin(), slice_axes.end());
+    std::vector<int64_t> infer_flags_tmp(infer_flags.begin(),
+                                         infer_flags.end());
+    std::vector<int64_t> decrease_axis_tmp(decrease_axis.begin(),
+                                           decrease_axis.end());
    if (op_type == "slice") {
-      out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
+      out = slice_final_state_dygraph_function(
-                                   paddle::experimental::Tensor(), {}, {},
+          self->tensor, slice_axes_tmp, slice_starts, slice_ends,
-                                   std::move(attrs));
+          infer_flags_tmp, decrease_axis_tmp);
    } else if (op_type == "strided_slice") {
-      out = strided_slice_dygraph_function(
+      out = strided_slice_final_state_dygraph_function(
-          self->tensor, paddle::experimental::Tensor(),
+          self->tensor, slice_axes, slice_starts, slice_ends, slice_strides);
-          paddle::experimental::Tensor(), paddle::experimental::Tensor(), {},
-          {}, {}, attrs);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Slice is only support slice and strided_slice, but we got %s which "
@@ -776,8 +781,8 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
    paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
                                        idx_tensor.get());
    framework::AttributeMap attrs = {{"dim", 0}};
-    out = index_select_dygraph_function(self->tensor, select_index,
+    out = index_select_final_state_dygraph_function(self->tensor, select_index,
-                                        std::move(attrs));
+                                                    0);
  }
  return ToPyObject(out);

--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1027,22 +1027,22 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
  // obj could be: int, float, bool, paddle.Tensor
  PyTypeObject* type = obj->ob_type;
  auto type_name = std::string(type->tp_name);
-  if (type_name == "int") {
+  if (PyBool_Check(obj)) {
+    bool value = CastPyArg2Boolean(obj, op_type, arg_pos);
+    return paddle::experimental::Scalar(value);
+  } else if (PyLong_Check(obj)) {
    int value = CastPyArg2Int(obj, op_type, arg_pos);
    return paddle::experimental::Scalar(value);
-  } else if (type_name == "float") {
+  } else if (PyFloat_Check(obj)) {
    float value = CastPyArg2Float(obj, op_type, arg_pos);
    return paddle::experimental::Scalar(value);
+  } else if (IsEagerTensor(obj)) {
-  } else if (type_name == "bool") {
-    bool value = CastPyArg2Boolean(obj, op_type, arg_pos);
-    return paddle::experimental::Scalar(value);
-  } else if (type_name == "Tensor") {
    paddle::experimental::Tensor& value = GetTensorFromPyObject(
        op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/);
    return paddle::experimental::Scalar(value);
+  } else if (PyObject_CheckLongOrToLong(&obj)) {
+    int value = CastPyArg2Int(obj, op_type, arg_pos);
+    return paddle::experimental::Scalar(value);
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "%s(): argument (position %d) must be "
@@ -1159,11 +1159,7 @@ paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type,
 paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type,
                                    ssize_t arg_pos) {
  if (obj == Py_None) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
+    return paddle::experimental::DataType::UNDEFINED;
-        "%s(): argument (position %d) must be "
-        "data_type, but got %s",
-        op_type, arg_pos + 1,
-        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
  }
  framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos);

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -543,7 +543,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                clip_input = (clip_var.astype('float16')
                              if g.dtype == core.VarDesc.VarType.FP16 else
                              clip_var)
-                new_grad = layers.elementwise_mul(x=g, y=clip_input)
+                new_grad = _C_ops.elementwise_mul(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))

--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -62,15 +62,6 @@ _complex_dtypes = [
 _already_patch_varbase = False
 _already_patch_eager_tensor = False
-# Dispatch to final state Python-C functions
-_final_state_op_type_mapping = {
-    "elementwise_add": "final_state_add",
-    "elementwise_sub": "final_state_subtract",
-    "elementwise_div": "final_state_divide",
-    "elementwise_mul": "final_state_multiply",
-    "matmul_v2": "final_state_matmul",
-}
 def monkey_patch_math_varbase():
    """
@@ -80,9 +71,13 @@ def monkey_patch_math_varbase():
    @no_grad
    def create_tensor(value, dtype, shape):
-        out = _varbase_creator(dtype=dtype)
+        if framework._in_eager_mode_:
-        out = _C_ops.fill_constant(out, 'dtype', dtype, 'shape', shape, 'value',
+            out = _C_ops.final_state_full(shape, value, dtype,
-                                   value, 'force_cpu', False)
+                                          framework._current_expected_place())
+        else:
+            out = _varbase_creator(dtype=dtype)
+            out = _C_ops.fill_constant(out, 'dtype', dtype, 'shape', shape,
+                                       'value', value, 'force_cpu', False)
        out.stop_gradient = True
        return out
@@ -120,9 +115,9 @@ def monkey_patch_math_varbase():
        return _C_ops.final_state_cast(self, dtype)
    def _scalar_elementwise_op_(var, scale, bias):
-        if _in_legacy_dygraph():
+        if framework.in_dygraph_mode():
-            return _C_ops.scale(var, 'scale', scale, 'bias', bias)
+            return _C_ops.final_state_scale(var, float(scale), bias, True)
-        return _C_ops.final_state_scale(var, float(scale), bias, True)
+        return _C_ops.scale(var, 'scale', scale, 'bias', bias)
    def _neg_(var):
        return _scalar_elementwise_op_(var, -1.0, 0.0)
@@ -203,7 +198,8 @@ def monkey_patch_math_varbase():
    def _binary_creator_(method_name,
                         op_type,
                         reverse=False,
-                         scalar_method=None):
+                         scalar_method=None,
+                         call_final_api=False):
        def __impl__(self, other_var):
            # 1. scalar exists cases
            # we need combine the tensor.dtype and scalar.dtype, cast correct object
@@ -287,15 +283,15 @@ def monkey_patch_math_varbase():
            # 4. calculation
            axis = -1
-            if in_dygraph_mode(
+            math_op = getattr(_C_ops, op_type)
-            ) and op_type in _final_state_op_type_mapping.keys():
+            if call_final_api:
-                math_op = getattr(_C_ops, _final_state_op_type_mapping[op_type])
+                return math_op(self, other_var, -1)
-                return math_op(self, other_var)
+            return math_op(self, other_var, 'axis', axis)
-            else:
-                math_op = getattr(_C_ops, op_type)
-                return math_op(self, other_var, 'axis', axis)
-        comment = OpProtoHolder.instance().get_op_proto(op_type).comment
+        if call_final_api:
+            comment = ""
+        else:
+            comment = OpProtoHolder.instance().get_op_proto(op_type).comment
        __impl__.__doc__ = """
        {0}
@@ -321,28 +317,48 @@ def monkey_patch_math_varbase():
        ('ndim', _ndim_),
        ('size', _size_),
        ('T', _T_),
-        ('__add__',
+        ('__add__', _binary_creator_('__add__', 'final_state_add', False,
-         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
+                                     _scalar_add_, True))
+        if framework._in_eager_mode_ else ('__add__', _binary_creator_(
+            '__add__', 'elementwise_add', False, _scalar_add_)),
        ##  a+b == b+a. Do not need to reverse explicitly
-        ('__radd__',
+        ('__radd__', _binary_creator_('__radd__', 'final_state_add', False,
-         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+                                      _scalar_add_, True))
-        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
+        if framework._in_eager_mode_ else ('__radd__', _binary_creator_(
-                                     _scalar_sub_)),
+            '__radd__', 'elementwise_add', False, _scalar_add_)),
-        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
+        ('__sub__', _binary_creator_('__sub__', 'final_state_subtract', False,
-                                      _scalar_rsub_)),
+                                     _scalar_sub_, True))
-        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
+        if framework._in_eager_mode_ else ('__sub__', _binary_creator_(
-                                     _scalar_mul_)),
+            '__sub__', 'elementwise_sub', False, _scalar_sub_)),
+        ('__rsub__', _binary_creator_('__rsub__', 'final_state_subtract', True,
+                                      _scalar_rsub_, True))
+        if framework._in_eager_mode_ else ('__rsub__', _binary_creator_(
+            '__rsub__', 'elementwise_sub', True, _scalar_rsub_)),
+        ('__mul__', _binary_creator_('__mul__', 'final_state_multiply', False,
+                                     _scalar_mul_, True))
+        if framework._in_eager_mode_ else ('__mul__', _binary_creator_(
+            '__mul__', 'elementwise_mul', False, _scalar_mul_)),
        ## a*b == b*a. Do not need to reverse explicitly
-        ('__rmul__',
+        ('__rmul__', _binary_creator_('__rmul__', 'final_state_multiply', False,
-         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+                                      _scalar_mul_, True))
-        ('__div__', _binary_creator_('__div__', 'elementwise_div', False,
+        if framework._in_eager_mode_ else ('__rmul__', _binary_creator_(
-                                     _scalar_div_)),
+            '__rmul__', 'elementwise_mul', False, _scalar_mul_)),
-        ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
+        ('__div__', _binary_creator_('__div__', 'final_state_divide', False,
-                                         False, _scalar_div_)),
+                                     _scalar_div_, True))
-        ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
+        if framework._in_eager_mode_ else ('__div__', _binary_creator_(
-                                      None)),
+            '__div__', 'elementwise_div', False, _scalar_div_)),
-        ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True,
+        ('__truediv__', _binary_creator_('__truediv__', 'final_state_divide',
-                                          None)),
+                                         False, _scalar_div_, True))
+        if framework._in_eager_mode_ else ('__truediv__', _binary_creator_(
+            '__truediv__', 'elementwise_div', False, _scalar_div_)),
+        ('__rdiv__', _binary_creator_('__rdiv__', 'final_state_divide', True,
+                                      None, True)) if framework._in_eager_mode_
+        else ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
+                                           None)),
+        ('__rtruediv__', _binary_creator_('rtruediv__', 'final_state_divide',
+                                          True, None, True))
+        if framework._in_eager_mode_ else ('__rtruediv__', _binary_creator_(
+            'rtruediv__', 'elementwise_div', True, None)),
        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
                                     None)),
        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
@@ -354,11 +370,26 @@ def monkey_patch_math_varbase():
        ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False,
                                        None)),
        ## for logical compare
+        ('__eq__',
+         _binary_creator_('__eq__', 'final_state_equal', False, None, True))
+        if framework._in_eager_mode_ else
        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
-        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__ne__', _binary_creator_('__ne__', 'final_state_not_equal', False,
-        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+                                    None, True)) if framework._in_eager_mode_
-        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        else ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__', _binary_creator_('__lt__', 'final_state_less_than', False,
+                                    None, True)) if framework._in_eager_mode_
+        else ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__', _binary_creator_('__le__', 'final_state_less_equal', False,
+                                    None, True)) if framework._in_eager_mode_
+        else ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__', _binary_creator_('__gt__', 'final_state_greater_than', False,
+                                    None, True))
+        if framework._in_eager_mode_ else
        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__', _binary_creator_('__ge__', 'final_state_greater_equal',
+                                    False, None, True))
+        if framework._in_eager_mode_ else
        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
        ('__array_ufunc__', None)
    ]

--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -72,6 +72,17 @@ final_state_name_mapping = {
        "axis2": "axis2",
        "out": "Out",
    },
+    "roi_align": {
+        "final_op_name": "final_state_roi_align",
+        "x": "X",
+        "boxes": "ROIs",
+        "boxes_num": "RoisNum",
+        "pooled_height": "pooled_height",
+        "pooled_width": "pooled_width",
+        "spatial_scale": "spatial_scale",
+        "sampling_ratio": "sampling_ratio",
+        "aligned": "aligned",
+    },
    # "one_hot": {
    #     "final_op_name": "final_state_one_hot",
    #     "x": "X",

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1796,13 +1796,16 @@ def greater_than(x, y, cond=None, name=None):
    attrs = dict()
-    helper.append_op(
+    if in_dygraph_mode():
-        type='greater_than',
+        return _C_ops.final_state_greater_than(x, y, -1)
-        inputs={'X': [x],
+    else:
-                'Y': [y]},
+        helper.append_op(
-        outputs={'Out': [cond]},
+            type='greater_than',
-        attrs=attrs)
+            inputs={'X': [x],
-    return cond
+                    'Y': [y]},
+            outputs={'Out': [cond]},
+            attrs=attrs)
+        return cond
 @templatedoc()

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -196,6 +196,17 @@ __all__ = [
    'unbind',
 ]
+OP_NAMEMAPPING = {
+    'elementwise_max': 'final_state_maximum',
+    'elementwise_min': 'final_state_minimum',
+    'elementwise_pow': 'final_state_elementwise_pow',
+    'elementwise_floordiv': 'final_state_floor_divide',
+    'elementwise_add': 'final_state_add',
+    'elementwise_sub': 'final_state_subtract',
+    'elementwise_mul': 'final_state_multiply',
+    'elementwise_div': 'final_state_divide',
+}
 @dygraph_only
 def _elementwise_op_in_dygraph(x,
@@ -204,8 +215,21 @@ def _elementwise_op_in_dygraph(x,
                               act=None,
                               use_mkldnn=False,
                               op_name=None):
-    op = getattr(_C_ops, op_name)
+    def is_inplace(op_name):
-    out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
+        return op_name[-1] == "_"
+    if op_name not in OP_NAMEMAPPING.keys() or axis != -1:
+        op = getattr(_C_ops, op_name)
+        out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
+    else:
+        if in_dygraph_mode():
+            op = getattr(_C_ops, OP_NAMEMAPPING[op_name]
+                         if not is_inplace(op_name) else op_name)
+            out = op(x, y)
+        if _in_legacy_dygraph():
+            op = getattr(_C_ops, op_name)
+            out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
    return dygraph_utils._append_activation_in_dygraph(
        out, act, use_mkldnn=use_mkldnn)
@@ -5093,9 +5117,12 @@ def split(input, num_or_sections, dim=-1, name=None):
            raise TypeError(
                "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but "
                "received %s." % (type(num_or_sections)))
-        out = [_varbase_creator() for n in range(num)]
+        if in_dygraph_mode():
-        _C_ops.split(input, out, *attrs)
+            return _C_ops.final_state_split(input, [num], dim)
-        return out
+        elif _in_legacy_dygraph():
+            out = [_varbase_creator() for n in range(num)]
+            _C_ops.split(input, out, *attrs)
+            return out
    check_variable_and_dtype(
        input, 'input',
@@ -7284,7 +7311,12 @@ def roi_align(input,
                                               sampling_ratio=-1,
                                               rois_num=rois_num)
    """
-    if _non_static_mode():
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        return _C_ops.final_state_roi_align(
+            input, rois, rois_num, pooled_height, pooled_width, spatial_scale,
+            sampling_ratio, False)
+    if _in_legacy_dygraph():
        assert rois_num is not None, "rois_num should not be None in dygraph mode."
        align_out = _C_ops.roi_align(
            input, rois, rois_num, "pooled_height", pooled_height,

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -2848,7 +2848,11 @@ class AdamaxOptimizer(Optimizer):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                if framework._non_static_mode():
-                    tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1)
+                    if framework.in_dygraph_mode():
+                        tmp = _C_ops.final_state_scale(beta1_pow_acc,
+                                                       self._beta1, 0.0, True)
+                    else:
+                        tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1)
                    beta1_pow_acc.copy_(tmp, False)
                else:
                    block.append_op(

--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -134,7 +134,11 @@ class L2DecayRegularizer(WeightDecayRegularizer):
        assert isinstance(block, framework.Block)
        if framework._non_static_mode():
-            return _C_ops.scale(param, "scale", self._regularization_coeff)
+            if framework.in_dygraph_mode():
+                return _C_ops.final_state_scale(
+                    param, self._regularization_coeff, 0.0, True)
+            else:
+                return _C_ops.scale(param, "scale", self._regularization_coeff)
        else:
            decay = block.create_var(
                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)

--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -37,7 +37,7 @@ from paddle.utils import deprecated
 from paddle import _C_ops
 from paddle import in_dynamic_mode
 from paddle.framework import core
-from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode
+from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place
 __all__ = []
@@ -116,13 +116,13 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean',
    if in_dygraph_mode():
        out = _C_ops.final_state_bce_loss(input, label)
        if weight is not None:
-            out = _C_ops.elementwise_mul(out, weight, 'axis', -1)
+            out = _C_ops.final_state_multiply(out, weight, 'axis', -1)
        if reduction == 'sum':
            return _C_ops.reduce_sum(out, 'dim', [0], 'keep_dim', False,
                                     "reduce_all", True)
        elif reduction == 'mean':
-            return _C_ops.mean(out)
+            return _C_ops.final_state_mean_all(out)
        else:
            return out
    else:
@@ -260,14 +260,17 @@ def binary_cross_entropy_with_logits(logit,
            % reduction)
    if _non_static_mode():
-        one = _varbase_creator(dtype=logit.dtype)
-        _C_ops.fill_constant(one, 'value',
-                             float(1.0), 'force_cpu', False, 'dtype', one.dtype,
-                             'str_value', '1.0', 'shape', [1])
        if in_dygraph_mode():
+            one = _C_ops.final_state_full([1],
+                                          float(1.0), core.VarDesc.VarType.FP32,
+                                          _current_expected_place())
            out = _C_ops.final_state_sigmoid_cross_entropy_with_logits(
                logit, label, False, -100)
        else:
+            one = _varbase_creator(dtype=logit.dtype)
+            _C_ops.fill_constant(one, 'value',
+                                 float(1.0), 'force_cpu', False, 'dtype',
+                                 one.dtype, 'str_value', '1.0', 'shape', [1])
            out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label)
        if pos_weight is not None:
            log_weight = _C_ops.elementwise_add(
@@ -405,7 +408,7 @@ def hsigmoid_loss(input,
            #  [2.2407534]]
    """
-    if in_dynamic_mode():
+    if _non_static_mode():
        out, _, _ = _C_ops.hierarchical_sigmoid(
            input, weight, label, path_table, path_code, bias, 'num_classes',
            num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse)
@@ -582,7 +585,19 @@ def margin_ranking_loss(input,
        raise ValueError(
            "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
            "received %s, which is not allowed." % reduction)
-    if in_dynamic_mode():
+    if in_dygraph_mode():
+        out = _C_ops.final_state_subtract(other, input)
+        out = _C_ops.final_state_multiply(out, label)
+        if margin != 0.0:
+            margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype)
+            out = _C_ops.elementwise_add(out, margin)
+        out = _C_ops.relu(out)
+        if reduction == 'sum':
+            return _C_ops.reduce_sum(out, 'reduce_all', True)
+        elif reduction == 'mean':
+            return _C_ops.final_state_mean_all(out)
+        return out
+    elif _in_legacy_dygraph():
        out = _C_ops.elementwise_sub(other, input)
        out = _C_ops.elementwise_mul(out, label)
        if margin != 0.0:
@@ -698,7 +713,17 @@ def l1_loss(input, label, reduction='mean', name=None):
            "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
            "received %s, which is not allowed." % reduction)
-    if in_dynamic_mode():
+    if in_dygraph_mode():
+        unreduced = _elementwise_op_in_dygraph(
+            input, label, axis=-1, act='abs', op_name='elementwise_sub')
+        if reduction == 'mean':
+            return _C_ops.final_state_mean_all(unreduced)
+        elif reduction == 'sum':
+            return _C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', False,
+                                     'reduce_all', True)
+        else:
+            return unreduced
+    elif in_dynamic_mode():
        unreduced = _elementwise_op_in_dygraph(
            input, label, axis=-1, act='abs', op_name='elementwise_sub')
        if reduction == 'mean':
@@ -1819,7 +1844,10 @@ def cross_entropy(input,
                                                 'reduce_all', True)
                return out_sum / (total_weight + (total_weight == 0.0))
            else:
-                return _C_ops.mean(out)
+                if in_dygraph_mode():
+                    return _C_ops.final_state_mean_all(out)
+                else:
+                    return _C_ops.mean(out)
        else:
            if input_dims - 1 == label_dims:
@@ -2064,6 +2092,8 @@ def sigmoid_focal_loss(logit,
        if reduction == "sum":
            return _C_ops.reduce_sum(loss, 'reduce_all', True)
        elif reduction == "mean":
+            if in_dygraph_mode():
+                return _C_ops.final_state_mean_all(loss)
            return _C_ops.mean(loss)
        return loss
@@ -2179,7 +2209,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
            "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', "
            "but received {}.".format(reduction))
-    if not in_dynamic_mode():
+    if not _non_static_mode():
        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                                 'hinge_embedding_loss')
        check_variable_and_dtype(label, 'label', ['float32', 'float64'],

--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle
 from .. import functional as F
-from paddle.fluid.framework import _varbase_creator
+from paddle.fluid.framework import _varbase_creator, in_dygraph_mode, _in_legacy_dygraph
 from .. import Layer
 from paddle import in_dynamic_mode
@@ -597,7 +597,11 @@ class MSELoss(Layer):
            fluid.data_feeder.check_variable_and_dtype(
                label, 'label', ['float32', 'float64'], 'MSELoss')
-        square_out = paddle.square(paddle.subtract(input, label))
+        if in_dygraph_mode():
+            square_out = paddle._C_ops.final_state_square(
+                paddle.subtract(input, label))
+        else:
+            square_out = paddle.square(paddle.subtract(input, label))
        if self.reduction == 'none':
            return square_out

--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -987,7 +987,9 @@ class Optimizer(object):
        assert regularization_term is not None
-        if framework._non_static_mode():
+        if framework.in_dygraph_mode():
+            return _C_ops.final_state_add_n([grad, regularization_term])
+        elif framework._in_legacy_dygraph():
            return _C_ops.sum([grad, regularization_term])
        new_grad = grad

--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -758,10 +758,13 @@ def cond(x, p=None, name=None):
        axis = axis if axis != None and axis != [] else [0]
        keepdim = False
-        if paddle.in_dynamic_mode():
+        if _non_static_mode():
            abs_out = _C_ops.abs(input)
-            sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim',
+            if in_dygraph_mode():
-                                        keepdim, 'reduce_all', reduce_all)
+                sum_out = _C_ops.final_state_sum(abs_out, axis, None, keepdim)
+            else:
+                sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim',
+                                            keepdim, 'reduce_all', reduce_all)
            if porder == 1 or porder == np.inf:
                return _C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim',
                                         keepdim, 'reduce_all', reduce_all)
@@ -815,7 +818,12 @@ def cond(x, p=None, name=None):
        reduce_all = True if axis is None or axis == [] else False
        keepdim = False
-        if paddle.in_dynamic_mode():
+        if in_dygraph_mode():
+            pow_out = _C_ops.pow(input, 'factor', porder)
+            sum_out_1 = _C_ops.final_state_sum(pow_out, axis, None, keepdim)
+            sum_out_2 = _C_ops.final_state_sum(sum_out_1, axis, None, keepdim)
+            return _C_ops.pow(sum_out_2, 'factor', float(1. / porder))
+        elif paddle.in_dynamic_mode():
            pow_out = _C_ops.pow(input, 'factor', porder)
            sum_out_1 = _C_ops.reduce_sum(pow_out, 'dim', axis, 'keepdim',
                                          keepdim, 'reduce_all', reduce_all)
@@ -869,10 +877,13 @@ def cond(x, p=None, name=None):
        u, s, vh = svd(input, full_matrices=False)
-        if paddle.in_dynamic_mode():
+        if _non_static_mode():
            if porder == "nuc":
-                return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim,
+                if in_dygraph_mode():
-                                         'reduce_all', reduce_all)
+                    return _C_ops.final_state_sum(s, axis, None, keepdim)
+                else:
+                    return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim,
+                                             'reduce_all', reduce_all)
            max_out = _C_ops.reduce_max(s, 'dim', axis, 'keepdim', keepdim,
                                        'reduce_all', reduce_all)
            min_out = _C_ops.reduce_min(s, 'dim', axis, 'keepdim', keepdim,
@@ -2530,7 +2541,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
            # or              out * x * out = x ;
    """
-    if paddle.in_dynamic_mode():
+    if _non_static_mode():
        if not hermitian:
            # combine svd and matmul op
            u, s, vt = _C_ops.svd(x, 'full_matrices', False)
@@ -2554,8 +2565,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
            v, _ = _C_ops.transpose2(vt, 'axis', perm)
            out_1 = v * st
-            out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y',
+            if in_dygraph_mode():
-                                     True)
+                out_2 = _C_ops.final_state_matmul(out_1, u, False, True)
+            else:
+                out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y',
+                                         True)
            return out_2
        else:
            # combine eigh and matmul op
@@ -2578,8 +2592,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
            out_1 = u * st
            u_conj = _C_ops.conj(u)
-            out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, 'trans_y',
+            if in_dygraph_mode():
-                                     True)
+                out_2 = _C_ops.final_state_matmul(out_1, u_conj, False, True)
+            else:
+                out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False,
+                                         'trans_y', True)
            return out_2
    else:
        if not hermitian:
@@ -3080,7 +3097,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
        elif x.dtype == paddle.float64:
            rcond = 1e-15 * max(x.shape[-2], x.shape[-1])
-    if paddle.in_dynamic_mode():
+    if _non_static_mode():
        solution, rank, singular_values = _C_ops.lstsq(x, y, "rcond", rcond,
                                                       "driver", driver)
        if x.shape[-2] > x.shape[-1]:
@@ -3089,8 +3106,11 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
                          False)
            minus_out = _C_ops.elementwise_sub(matmul_out, y)
            pow_out = _C_ops.pow(minus_out, 'factor', 2)
-            residuals = _C_ops.reduce_sum(pow_out, 'dim', [-2], 'keepdim',
+            if in_dygraph_mode():
-                                          False, 'reduce_all', False)
+                residuals = _C_ops.final_state_sum(pow_out, [-2], None, False)
+            else:
+                residuals = _C_ops.reduce_sum(pow_out, 'dim', [-2], 'keepdim',
+                                              False, 'reduce_all', False)
        else:
            residuals = paddle.empty(shape=[0], dtype=x.dtype)

--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -424,6 +424,10 @@ OP_NAMEMAPPING = {
    'elementwise_pow': 'final_state_elementwise_pow',
    'elementwise_floordiv': 'final_state_floor_divide',
    'elementwise_mod': 'final_state_modulo',
+    'elementwise_add': 'final_state_add',
+    'elementwise_sub': 'final_state_subtract',
+    'elementwise_mul': 'final_state_multiply',
+    'elementwise_div': 'final_state_divide',
 }
 @dygraph_only
@@ -436,7 +440,7 @@ def _elementwise_op_in_dygraph(x,
    def is_inplace(op_name):
        return  op_name[-1] == "_"
-    if op_name not in OP_NAMEMAPPING.keys():
+    if op_name not in OP_NAMEMAPPING.keys() or axis != -1:
        op = getattr(_C_ops, op_name)
        out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
    else:
@@ -1528,7 +1532,9 @@ def mm(input, mat2, name=None):
    """
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        return _C_ops.final_state_matmul(input, mat2, False, False)
+    elif paddle.in_dynamic_mode():
        return _C_ops.matmul_v2(input, mat2)
    def __check_input(x, y):
@@ -1751,7 +1757,9 @@ def inner(x, y, name=None):
        nx = x.reshape((-1, xshape[-1]))
        ny = y.reshape((-1, yshape[-1]))
-        if paddle.in_dynamic_mode():
+        if in_dygraph_mode():
+            return _C_ops.final_state_matmul(nx, ny.T, False, False).reshape(dstshape)
+        elif paddle.in_dynamic_mode():
            return _C_ops.matmul_v2(nx, ny.T).reshape(dstshape)
        def __check_input(x, y):
@@ -1814,7 +1822,9 @@ def outer(x, y, name=None):
    nx = x.reshape((-1, 1))
    ny = y.reshape((1, -1))
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        return _C_ops.final_state_matmul(nx, ny, False, False)
+    elif paddle.in_dynamic_mode():
        return _C_ops.matmul_v2(nx, ny)
    def __check_input(x, y):
@@ -3965,7 +3975,11 @@ def rad2deg(x, name=None):
            #         [57.29578018])
    """
    rad2deg_scale = 180 / np.pi
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        if convert_dtype(x.dtype) in ['int32', 'int64']:
+            x = cast(x, dtype="float32")
+        return _C_ops.final_state_scale(x, rad2deg_scale, 0.0, True)
+    elif paddle.in_dynamic_mode():
        if convert_dtype(x.dtype) in ['int32', 'int64']:
            x = cast(x, dtype="float32")
        return _C_ops.scale(x, 'scale', rad2deg_scale)
@@ -4018,7 +4032,11 @@ def deg2rad(x, name=None):
            #         [3.14159274])
    """
    deg2rad_scale = np.pi / 180.0
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        if convert_dtype(x.dtype) in ['int32', 'int64']:
+            x = cast(x, dtype="float32")
+        return _C_ops.final_state_scale(x, deg2rad_scale, 0.0, True)
+    elif paddle.in_dynamic_mode():
        if convert_dtype(x.dtype) in ['int32', 'int64']:
            x = cast(x, dtype="float32")
        return _C_ops.scale(x, 'scale', deg2rad_scale)
@@ -4263,14 +4281,22 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
        attrs_1 += ('starts', starts_1)
        ends_1 = [dim_len - 1]
        attrs_1 += ('ends', ends_1)
-        input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
+        if in_dygraph_mode():
-            'infer_flags', infer_flags, *attrs_1)
+            input_front = _C_ops.final_state_slice(new_input, axes, starts_1, ends_1, infer_flags,
+                                            [])
+        else:
+            input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
+                'infer_flags', infer_flags, *attrs_1)
        starts_2 = [1]
        attrs_2 += ('starts', starts_2)
        ends_2 = [dim_len]
        attrs_2 += ('ends', ends_2)
-        input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
+        if in_dygraph_mode():
-            'infer_flags', infer_flags, *attrs_2)
+            input_back = input_front = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags,
+                                            [])
+        else:
+            input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
+                'infer_flags', infer_flags, *attrs_2)
        if x.dtype == paddle.bool:
            op = getattr(_C_ops, "logical_xor")

--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -398,7 +398,9 @@ def nonzero(x, as_tuple=False):
    shape = x.shape
    rank = len(shape)
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        outs = _C_ops.final_state_where_index(x)
+    elif paddle.in_dynamic_mode():
        outs = _C_ops.where_index(x)
    else:
        helper = LayerHelper("where_index", **locals())