[Eager, Performance optimization] math op sink to Cpp level ( + and - operator...

[Eager, Performance optimization] math op sink to Cpp level ( + and - operator as an example ) (#45811) * [Eager] math op sink to Cpp level * fix ci errors * draft version * draft version * draft version * support + and - operator under cpp directly * add static test * polish code * promote types or unify right type to left * recover static test case * polish code and fix some ci errors * support complex and polish code * fix conflicts * fix windows ci errors * fix windows-inference-ci errors * polish and fix tests * fix test case * polish code * polish code * polish code and fix code-format * polish code * polish code * polish code * polish code

[Eager, Performance optimization] math op sink to Cpp level ( + and - operator...
[Eager, Performance optimization] math op sink to Cpp level ( + and - operator as an example ) (#45811) * [Eager] math op sink to Cpp level * fix ci errors * draft version * draft version * draft version * support + and - operator under cpp directly * add static test * polish code * promote types or unify right type to left * recover static test case * polish code and fix some ci errors * support complex and polish code * fix conflicts * fix windows ci errors * fix windows-inference-ci errors * polish and fix tests * fix test case * polish code * polish code * polish code and fix code-format * polish code * polish code * polish code * polish code
23c50648 · Weilong Wu · GitHub · 88eb82fb · 23c50648 · 23c50648
7 changed files
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -557,6 +557,7 @@ if(WITH_PYTHON)
    set(PYBIND_SRCS eager_py_layer.cc ${PYBIND_SRCS})
    set(PYBIND_SRCS eager_legacy_op_function.cc ${PYBIND_SRCS})
    set(PYBIND_SRCS eager_op_function.cc ${PYBIND_SRCS})
+    set(PYBIND_SRCS eager_math_op_patch.cc ${PYBIND_SRCS})
    list(APPEND PYBIND_DEPS eager_api)
    list(APPEND PYBIND_DEPS autograd_meta)
    list(APPEND PYBIND_DEPS backward)

--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -1113,6 +1113,20 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
  return 1;
 }

+// Appends the sentinel-terminated PyMethodDef array `methods` to `vector`.
+// When `vector` is not empty its last element is dropped first (it is
+// expected to be the sentinel left by a previous call), so the merged table
+// ends with exactly one terminating entry, copied from `methods`.
+void AddPyMethodDefs(std::vector<PyMethodDef>* vector, PyMethodDef* methods) {
+  if (!vector->empty()) {
+    // Discard the terminator written by the previous append.
+    vector->pop_back();
+  }
+  const PyMethodDef* cursor = methods;
+  for (; cursor->ml_name != nullptr; ++cursor) {
+    vector->push_back(*cursor);
+  }
+  // `cursor` now points at the entry whose ml_name is null; copy it so the
+  // table stays terminated.
+  vector->push_back(*cursor);
+}
+
 static void TensorDealloc(TensorObject* self) {
  if (self->weakrefs != NULL)
    PyObject_ClearWeakRefs(reinterpret_cast<PyObject*>(self));
@@ -1124,6 +1138,7 @@ extern struct PyGetSetDef variable_properties[];
 extern struct PyGetSetDef string_tensor_variable_properties[];

 extern PyMethodDef variable_methods[];
+extern PyMethodDef math_op_patch_methods[];
 extern PyMethodDef string_tensor_variable_methods[];

 PyNumberMethods number_methods;
@@ -1133,6 +1148,10 @@ PyMappingMethods mapping_methods;
 void BindEager(pybind11::module* module) {
  auto m = module->def_submodule("eager");

+  static std::vector<PyMethodDef> methods;
+  AddPyMethodDefs(&methods, variable_methods);
+  AddPyMethodDefs(&methods, math_op_patch_methods);
+
  auto heap_type = reinterpret_cast<PyHeapTypeObject*>(
      PyType_Type.tp_alloc(&PyType_Type, 0));
  heap_type->ht_name = ToPyObject("Tensor");
@@ -1144,7 +1163,7 @@ void BindEager(pybind11::module* module) {
  type->tp_as_number = &number_methods;
  type->tp_as_sequence = &sequence_methods;
  type->tp_as_mapping = &mapping_methods;
-  type->tp_methods = variable_methods;
+  type->tp_methods = methods.data();
  type->tp_getset = variable_properties;
  type->tp_init = TensorInit;
  type->tp_new = TensorNew;

--- a/paddle/fluid/pybind/eager_math_op_patch.cc
+++ b/paddle/fluid/pybind/eager_math_op_patch.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// disable numpy compile error
+
+#if defined(_MSC_VER)
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
+#include <Python.h>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/hooks.h"
+#include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/pybind/eager.h"
+#include "paddle/fluid/pybind/eager_utils.h"
+#include "paddle/fluid/pybind/exception.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "pybind11/detail/internals.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/python_headers.h"
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
+#include "paddle/fluid/pybind/op_function_common.h"
+#include "paddle/fluid/pybind/tensor_py.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace paddle {
+namespace pybind {
+
+extern PyTypeObject* p_tensor_type;
+
+// Returns true only when `obj` is an instance of the eager Tensor type
+// (`p_tensor_type`), as decided by PyObject_IsInstance.
+//
+// PyObject_IsInstance() is tri-state: 1 (instance), 0 (not an instance) and
+// -1 (the check itself failed and a Python exception is set). Converting that
+// int straight to bool would report a failed check as "is a Tensor", so the
+// result is compared against 1 explicitly. On -1 the Python exception is left
+// pending for the caller's error handling.
+bool PyCheckTensor(PyObject* obj) {
+  const int is_instance =
+      PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
+  return is_instance == 1;
+}
+
+// Returns true when `obj` is a Python integer object that is not a bool.
+// Bools are rejected first because PyBool is a subtype of the integer type
+// and would otherwise pass the integer check.
+static bool PyCheckInteger(PyObject* obj) {
+  if (PyBool_Check(obj)) {
+    return false;
+  }
+#if PY_VERSION_HEX < 0x03000000
+  // Python 2 has two integer types: `long` and `int`.
+  return PyLong_Check(obj) || PyInt_Check(obj);
+#else
+  return PyLong_Check(obj);
+#endif
+}
+
+// Returns true when the type name of `obj` is one of the numpy integer scalar
+// types listed below (int64, longlong, int32, int16).
+static bool IsNumpyType(PyObject* obj) {
+  // Judging the type of obj by its type's name is not a good approach; using
+  // `PyArray_IsScalar` would probably be better. However, that interface
+  // cannot be used by including pybind11 alone, and it needs to compile
+  // against numpy.
+  static const char* const kNumpyIntNames[] = {
+      "numpy.int64", "numpy.longlong", "numpy.int32", "numpy.int16"};
+  const std::string type_name(Py_TYPE(obj)->tp_name);
+  for (const char* candidate : kNumpyIntNames) {
+    if (type_name == candidate) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Dtypes treated as "integer-like" by the operator methods in this file:
+// when the other operand is a Python float and the self tensor has one of
+// these dtypes, the self tensor is cast to FLOAT32 before the scalar op runs.
+std::set<phi::DataType> _supported_int_dtype_{DataType::UINT8,
+                                              DataType::INT8,
+                                              DataType::INT16,
+                                              DataType::INT32,
+                                              DataType::INT64,
+                                              DataType::BOOL};
+// Complex dtypes. When the two tensor operands have different dtypes and at
+// least one of them is in this set, both are cast to the dtype returned by
+// PromoteTypesIfComplexExists; otherwise the right operand is cast to the
+// left operand's dtype.
+std::set<phi::DataType> _complex_dtypes{
+    DataType::COMPLEX64,
+    DataType::COMPLEX128,
+};
+
+// _supported_promote_complex_types_
+// Operator names for which complex dtype promotion is intended. This list is
+// kept as a comment only; no code in this file reads it.
+//     '__add__',
+//     '__radd__',
+//     '__sub__',
+//     '__rsub__',
+//     '__mul__',
+//     '__rmul__',
+//     '__div__',
+//     '__truediv__',
+//     '__rdiv__',
+//     '__rtruediv__',
+//     '__matmul__',
+
+// Selects the device referenced by `place` for GPU and custom-device places.
+// Other kinds of place are left untouched. Throws PreconditionNotMet when the
+// place needs a backend (GPU / CUSTOM_DEVICE) this build was compiled without.
+void SetDevice(paddle::platform::Place place) {
+  const bool targets_gpu = paddle::platform::is_gpu_place(place);
+  if (targets_gpu) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    phi::backends::gpu::SetDeviceId(place.device);
+    VLOG(1) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
+            << " from " << static_cast<int>(place.device);
+#else
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU if use CUDAPlace."));
+#endif
+  }
+
+  const bool targets_custom = paddle::platform::is_custom_place(place);
+  if (targets_custom) {
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+    phi::DeviceManager::SetDevice(place);
+    VLOG(1) << "CurrentDeviceId: "
+            << phi::DeviceManager::GetDevice(place.GetDeviceType()) << " from "
+            << static_cast<int>(place.device);
+#else
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with CUSTOM_DEVICE if use "
+        "CustomPlace."));
+#endif
+  }
+}
+
+// Applies a tensor-with-Python-scalar operation through scale_ad_func.
+// Only "add", "radd", "sub" and "rsub" are implemented here; support for
+// further operators (mul, rmul, div, truediv) is to be added gradually.
+//
+// NOTE(review): the mapping below relies on scale_ad_func(x, scale, bias,
+// true) computing `scale * x + bias` - confirm against the scale op docs.
+//   add / radd : 1 * x + other
+//   sub        : 1 * x + (-other)
+//   rsub       : -1 * x + other
+//
+// Any other `op_type` raises InvalidArgument instead of silently handing an
+// uninitialized Tensor back to Python.
+paddle::experimental::Tensor CallScalarFuction(
+    const paddle::experimental::Tensor& self_tensor,
+    float other,
+    std::string op_type) {
+  paddle::experimental::Tensor ret;
+  if (op_type == "add" || op_type == "radd") {
+    ret = scale_ad_func(self_tensor, phi::Scalar(1.0), other, true);
+  } else if (op_type == "sub") {
+    ret = scale_ad_func(self_tensor, phi::Scalar(1.0), -other, true);
+  } else if (op_type == "rsub") {
+    ret = scale_ad_func(self_tensor, phi::Scalar(-1.0), other, true);
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "Unsupported op_type `%s` in CallScalarFuction, expected one of "
+        "add, radd, sub or rsub.",
+        op_type));
+  }
+
+  return ret;
+}
+
+// Native implementation of Tensor.__add__ / Tensor.__radd__.
+//
+// `args` is the positional-argument tuple; its first item is the other
+// operand. `kwargs` is accepted because the method is registered with
+// METH_KEYWORDS, but it is not read.
+//
+// Steps:
+//   1. Python float, non-bool Python int and the numpy integer scalars
+//      recognised by IsNumpyType go through CallScalarFuction. For a Python
+//      float, a self tensor with an integer-like dtype is first cast to
+//      FLOAT32.
+//   2. Otherwise the operand is taken as a Tensor, or a Tensor with self's
+//      shape and dtype is created from it with full_ad_func.
+//   3. If the dtypes differ: when either side is complex both sides are cast
+//      to the promoted dtype, otherwise the right operand is cast to the left
+//      operand's dtype (with a warning).
+//   4. add_ad_func computes the result.
+//
+// Returns the result converted by ToPyObject. Errors are handled by
+// EAGER_CATCH_AND_THROW_RETURN_NULL.
+static PyObject* tensor__add__method(TensorObject* self,
+                                     PyObject* args,
+                                     PyObject* kwargs) {
+  paddle::platform::RecordEvent pythonc_record_event(
+      "__add__ or __radd__ pybind_patch_func",
+      paddle::platform::TracerEventType::UserDefined,
+      1);
+
+  EAGER_TRY
+  VLOG(6) << "Running Eager tensor__add__method";
+  // PyTuple_GET_ITEM performs no bounds checking, so reject an empty argument
+  // tuple (e.g. `tensor.__add__()`) before reading item 0.
+  if (PyTuple_GET_SIZE(args) < 1) {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "__add__ requires one operand, but no positional argument was "
+        "given."));
+  }
+
+  // Set Device ID
+  auto place = egr::Controller::Instance().GetExpectedPlace();
+  SetDevice(place);
+
+  paddle::experimental::Tensor ret;
+  paddle::experimental::Tensor self_tensor = self->tensor;
+  PyObject* other_obj = PyTuple_GET_ITEM(args, 0);
+
+  // 1. scalar exists cases
+  if (PyFloat_Check(other_obj) || PyCheckInteger(other_obj) ||
+      IsNumpyType(other_obj)) {
+    float other = 0.0;
+    if (PyFloat_Check(other_obj)) {
+      other = CastPyArg2AttrFloat(other_obj, 0);
+      if (_supported_int_dtype_.find(self_tensor.dtype()) !=
+          _supported_int_dtype_.end()) {
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, DataType::FLOAT32);
+      }
+    } else {
+      // Not a float, so the enclosing condition guarantees a Python int or a
+      // recognised numpy integer scalar; no need to re-run the checks.
+      other = static_cast<float>(CastPyArg2AttrInt(other_obj, 0));
+    }
+
+    {
+      eager_gil_scoped_release guard;
+      ret = CallScalarFuction(self_tensor, other, "add");
+    }
+    return ToPyObject(ret);
+  }
+
+  // 2. create or get tensor for other_obj
+  paddle::experimental::Tensor other_tensor;
+  if (!PyCheckTensor(other_obj)) {
+    paddle::experimental::Scalar value =
+        CastPyArg2Scalar(other_obj, "__add__", 0);
+    {
+      eager_gil_scoped_release guard;
+      other_tensor =
+          full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place);
+    }
+  } else {
+    other_tensor = CastPyArg2Tensor(other_obj, 0);
+  }
+
+  // 3. promote types or unify right var type to left var
+  phi::DataType lhs_dtype = self_tensor.dtype();
+  phi::DataType rhs_dtype = other_tensor.dtype();
+  if (lhs_dtype != rhs_dtype) {
+    // note: only op_type in _supported_promote_complex_types_ should promote
+    // dtype
+    if (_complex_dtypes.find(lhs_dtype) != _complex_dtypes.end() ||
+        _complex_dtypes.find(rhs_dtype) != _complex_dtypes.end()) {
+      phi::DataType promote_dtype =
+          framework::TransToPhiDataType(framework::PromoteTypesIfComplexExists(
+              framework::TransToProtoVarType(lhs_dtype),
+              framework::TransToProtoVarType(rhs_dtype)));
+      if (lhs_dtype != promote_dtype) {
+        // cast
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, promote_dtype);
+      }
+      if (rhs_dtype != promote_dtype) {
+        eager_gil_scoped_release guard;
+        other_tensor = cast_ad_func(other_tensor, promote_dtype);
+      }
+    } else {
+      LOG(WARNING)
+          << "The dtype of left and right Tensor are not the same, left "
+             "dtype is "
+          << lhs_dtype << ", but right dtype is " << rhs_dtype
+          << ", the right dtype will convert to " << lhs_dtype;
+      eager_gil_scoped_release guard;
+      other_tensor = cast_ad_func(other_tensor, lhs_dtype);
+    }
+  }
+
+  // 4. calculation
+  VLOG(6) << "Calling add_ad_func in tensor__add__method";
+
+  {
+    eager_gil_scoped_release guard;
+    ret = add_ad_func(self_tensor, other_tensor);
+  }
+
+  return ToPyObject(ret);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+// Native implementation of Tensor.__sub__ (self - other).
+//
+// `args` is the positional-argument tuple; its first item is the other
+// operand. `kwargs` is accepted because the method is registered with
+// METH_KEYWORDS, but it is not read.
+//
+// Steps:
+//   1. Python float, non-bool Python int and the numpy integer scalars
+//      recognised by IsNumpyType go through CallScalarFuction("sub"). For a
+//      Python float, a self tensor with an integer-like dtype is first cast
+//      to FLOAT32.
+//   2. Otherwise the operand is taken as a Tensor, or a Tensor with self's
+//      shape and dtype is created from it with full_ad_func.
+//   3. If the dtypes differ: when either side is complex both sides are cast
+//      to the promoted dtype, otherwise the right operand is cast to the left
+//      operand's dtype (with a warning).
+//   4. subtract_ad_func(self, other) computes the result.
+//
+// Returns the result converted by ToPyObject. Errors are handled by
+// EAGER_CATCH_AND_THROW_RETURN_NULL.
+static PyObject* tensor__sub__method(TensorObject* self,
+                                     PyObject* args,
+                                     PyObject* kwargs) {
+  paddle::platform::RecordEvent pythonc_record_event(
+      "__sub__ pybind_patch_func",
+      paddle::platform::TracerEventType::UserDefined,
+      1);
+
+  EAGER_TRY
+  VLOG(6) << "Running Eager tensor__sub__method";
+  // PyTuple_GET_ITEM performs no bounds checking, so reject an empty argument
+  // tuple (e.g. `tensor.__sub__()`) before reading item 0.
+  if (PyTuple_GET_SIZE(args) < 1) {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "__sub__ requires one operand, but no positional argument was "
+        "given."));
+  }
+
+  // Set Device ID
+  auto place = egr::Controller::Instance().GetExpectedPlace();
+  SetDevice(place);
+
+  paddle::experimental::Tensor ret;
+  paddle::experimental::Tensor self_tensor = self->tensor;
+
+  PyObject* other_obj = PyTuple_GET_ITEM(args, 0);
+  // 1. scalar exists cases
+  if (PyFloat_Check(other_obj) || PyCheckInteger(other_obj) ||
+      IsNumpyType(other_obj)) {
+    float other = 0.0;
+    if (PyFloat_Check(other_obj)) {
+      other = CastPyArg2AttrFloat(other_obj, 0);
+      if (_supported_int_dtype_.find(self_tensor.dtype()) !=
+          _supported_int_dtype_.end()) {
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, DataType::FLOAT32);
+      }
+    } else {
+      // Not a float, so the enclosing condition guarantees a Python int or a
+      // recognised numpy integer scalar; no need to re-run the checks.
+      other = static_cast<float>(CastPyArg2AttrInt(other_obj, 0));
+    }
+    {
+      eager_gil_scoped_release guard;
+      ret = CallScalarFuction(self_tensor, other, "sub");
+    }
+
+    return ToPyObject(ret);
+  }
+  // 2. create or get tensor for other_obj
+  paddle::experimental::Tensor other_tensor;
+  if (!PyCheckTensor(other_obj)) {
+    paddle::experimental::Scalar value =
+        CastPyArg2Scalar(other_obj, "__sub__", 0);
+    {
+      eager_gil_scoped_release guard;
+      other_tensor =
+          full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place);
+    }
+  } else {
+    other_tensor = CastPyArg2Tensor(other_obj, 0);
+  }
+
+  // 3. promote types or unify right var type to left var
+  phi::DataType lhs_dtype = self_tensor.dtype();
+  phi::DataType rhs_dtype = other_tensor.dtype();
+  if (lhs_dtype != rhs_dtype) {
+    if (_complex_dtypes.find(lhs_dtype) != _complex_dtypes.end() ||
+        _complex_dtypes.find(rhs_dtype) != _complex_dtypes.end()) {
+      phi::DataType promote_dtype =
+          framework::TransToPhiDataType(framework::PromoteTypesIfComplexExists(
+              framework::TransToProtoVarType(lhs_dtype),
+              framework::TransToProtoVarType(rhs_dtype)));
+      if (lhs_dtype != promote_dtype) {
+        // cast
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, promote_dtype);
+      }
+      if (rhs_dtype != promote_dtype) {
+        eager_gil_scoped_release guard;
+        other_tensor = cast_ad_func(other_tensor, promote_dtype);
+      }
+    } else {
+      LOG(WARNING)
+          << "The dtype of left and right Tensor are not the same, left "
+             "dtype is "
+          << lhs_dtype << ", but right dtype is " << rhs_dtype
+          << ", the right dtype will convert to " << lhs_dtype;
+      eager_gil_scoped_release guard;
+      other_tensor = cast_ad_func(other_tensor, lhs_dtype);
+    }
+  }
+  // 4. calculation
+  VLOG(6) << "Calling subtract_ad_func in tensor__sub__method";
+  {
+    eager_gil_scoped_release guard;
+    ret = subtract_ad_func(self_tensor, other_tensor);
+  }
+
+  return ToPyObject(ret);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+// Native implementation of Tensor.__rsub__ (other - self).
+//
+// `args` is the positional-argument tuple; its first item is the other
+// operand. `kwargs` is accepted because the method is registered with
+// METH_KEYWORDS, but it is not read.
+//
+// Steps:
+//   1. Python float, non-bool Python int and the numpy integer scalars
+//      recognised by IsNumpyType go through CallScalarFuction("rsub"). For a
+//      Python float, a self tensor with an integer-like dtype is first cast
+//      to FLOAT32.
+//   2. Otherwise the operand is taken as a Tensor, or a Tensor with self's
+//      shape and dtype is created from it with full_ad_func.
+//   3. If the dtypes differ: when either side is complex both sides are cast
+//      to the promoted dtype, otherwise `other` is cast to self's dtype (with
+//      a warning).
+//   4. subtract_ad_func(other, self) computes the result; note the swapped
+//      operand order compared with __sub__.
+//
+// Returns the result converted by ToPyObject. Errors are handled by
+// EAGER_CATCH_AND_THROW_RETURN_NULL.
+static PyObject* tensor__rsub__method(TensorObject* self,
+                                      PyObject* args,
+                                      PyObject* kwargs) {
+  paddle::platform::RecordEvent pythonc_record_event(
+      "__rsub__ pybind_patch_func",
+      paddle::platform::TracerEventType::UserDefined,
+      1);
+
+  EAGER_TRY
+  // Same verbosity level as the __add__/__sub__ entry traces.
+  VLOG(6) << "Running Eager tensor__rsub__method";
+  // PyTuple_GET_ITEM performs no bounds checking, so reject an empty argument
+  // tuple (e.g. `tensor.__rsub__()`) before reading item 0.
+  if (PyTuple_GET_SIZE(args) < 1) {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "__rsub__ requires one operand, but no positional argument was "
+        "given."));
+  }
+
+  // Set Device ID
+  auto place = egr::Controller::Instance().GetExpectedPlace();
+  SetDevice(place);
+
+  paddle::experimental::Tensor ret;
+  paddle::experimental::Tensor self_tensor = self->tensor;
+  PyObject* other_obj = PyTuple_GET_ITEM(args, 0);
+
+  // 1. scalar exists cases
+  if (PyFloat_Check(other_obj) || PyCheckInteger(other_obj) ||
+      IsNumpyType(other_obj)) {
+    float other = 0.0;
+    if (PyFloat_Check(other_obj)) {
+      other = CastPyArg2AttrFloat(other_obj, 0);
+      if (_supported_int_dtype_.find(self_tensor.dtype()) !=
+          _supported_int_dtype_.end()) {
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, DataType::FLOAT32);
+      }
+    } else {
+      // Not a float, so the enclosing condition guarantees a Python int or a
+      // recognised numpy integer scalar; no need to re-run the checks.
+      other = static_cast<float>(CastPyArg2AttrInt(other_obj, 0));
+    }
+    {
+      eager_gil_scoped_release guard;
+      ret = CallScalarFuction(self_tensor, other, "rsub");
+    }
+    return ToPyObject(ret);
+  }
+
+  // 2. create or get tensor for other_obj
+  paddle::experimental::Tensor other_tensor;
+  if (!PyCheckTensor(other_obj)) {
+    paddle::experimental::Scalar value =
+        CastPyArg2Scalar(other_obj, "__rsub__", 0);
+    {
+      eager_gil_scoped_release guard;
+      other_tensor =
+          full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place);
+    }
+  } else {
+    other_tensor = CastPyArg2Tensor(other_obj, 0);
+  }
+
+  // 3. promote types or unify right var type to left var
+  phi::DataType lhs_dtype = self_tensor.dtype();
+  phi::DataType rhs_dtype = other_tensor.dtype();
+  if (lhs_dtype != rhs_dtype) {
+    if (_complex_dtypes.find(lhs_dtype) != _complex_dtypes.end() ||
+        _complex_dtypes.find(rhs_dtype) != _complex_dtypes.end()) {
+      phi::DataType promote_dtype =
+          framework::TransToPhiDataType(framework::PromoteTypesIfComplexExists(
+              framework::TransToProtoVarType(lhs_dtype),
+              framework::TransToProtoVarType(rhs_dtype)));
+      if (lhs_dtype != promote_dtype) {
+        // cast
+        eager_gil_scoped_release guard;
+        self_tensor = cast_ad_func(self_tensor, promote_dtype);
+      }
+      if (rhs_dtype != promote_dtype) {
+        eager_gil_scoped_release guard;
+        other_tensor = cast_ad_func(other_tensor, promote_dtype);
+      }
+    } else {
+      LOG(WARNING)
+          << "The dtype of left and right Tensor are not the same, left "
+             "dtype is "
+          << lhs_dtype << ", but right dtype is " << rhs_dtype
+          << ", the right dtype will convert to " << lhs_dtype;
+      eager_gil_scoped_release guard;
+      other_tensor = cast_ad_func(other_tensor, lhs_dtype);
+    }
+  }
+
+  // 4. calculation
+  VLOG(6) << "Calling subtract_ad_func in tensor__rsub__method";
+  {
+    eager_gil_scoped_release guard;
+    ret = subtract_ad_func(other_tensor, self_tensor);
+  }
+
+  return ToPyObject(ret);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+// Method table for the Tensor operators implemented natively in this file.
+// __add__ and __radd__ share one implementation. Every entry takes positional
+// and keyword arguments, and the table ends with an all-null sentinel entry.
+PyMethodDef math_op_patch_methods[] = {
+    {"__add__",
+     reinterpret_cast<PyCFunction>(
+         reinterpret_cast<void (*)(void)>(tensor__add__method)),
+     METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+    {"__radd__",
+     reinterpret_cast<PyCFunction>(
+         reinterpret_cast<void (*)(void)>(tensor__add__method)),
+     METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+    {"__sub__",
+     reinterpret_cast<PyCFunction>(
+         reinterpret_cast<void (*)(void)>(tensor__sub__method)),
+     METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+    {"__rsub__",
+     reinterpret_cast<PyCFunction>(
+         reinterpret_cast<void (*)(void)>(tensor__rsub__method)),
+     METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+    {nullptr, nullptr, 0, nullptr}};
+
+}  // namespace pybind
+}  // namespace paddle
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -333,99 +333,102 @@ def monkey_patch_math_varbase():
        ('ndim', _ndim_),
        ('size', _size_),
        ('T', _T_),
-        ('__add__', _binary_creator_('__add__', 'add', False, _scalar_add_,
-                                     True)) if framework._in_eager_mode_ else
        ('__add__',
         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
-        ##  a+b == b+a. Do not need to reverse explicitly
-        ('__radd__',
-         _binary_creator_('__radd__', 'add', False, _scalar_add_, True))
-        if framework._in_eager_mode_ else
+        #  a+b == b+a. Do not need to reverse explicitly
        ('__radd__',
         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
-        ('__sub__',
-         _binary_creator_('__sub__', 'subtract', False, _scalar_sub_, True))
-        if framework._in_eager_mode_ else
        ('__sub__',
         _binary_creator_('__sub__', 'elementwise_sub', False, _scalar_sub_)),
-        ('__rsub__',
-         _binary_creator_('__rsub__', 'subtract', True, _scalar_rsub_, True))
-        if framework._in_eager_mode_ else
        ('__rsub__',
         _binary_creator_('__rsub__', 'elementwise_sub', True, _scalar_rsub_)),
-        ('__mul__',
-         _binary_creator_('__mul__', 'multiply', False, _scalar_mul_, True))
-        if framework._in_eager_mode_ else
        ('__mul__',
         _binary_creator_('__mul__', 'elementwise_mul', False, _scalar_mul_)),
        ## a*b == b*a. Do not need to reverse explicitly
-        ('__rmul__',
-         _binary_creator_('__rmul__', 'multiply', False, _scalar_mul_, True))
-        if framework._in_eager_mode_ else
        ('__rmul__',
         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
-        ('__div__',
-         _binary_creator_('__div__', 'divide', False, _scalar_div_, True))
-        if framework._in_eager_mode_ else
        ('__div__',
         _binary_creator_('__div__', 'elementwise_div', False, _scalar_div_)),
-        ('__truediv__',
-         _binary_creator_('__truediv__', 'divide', False, _scalar_div_, True))
-        if framework._in_eager_mode_ else
        ('__truediv__',
         _binary_creator_('__truediv__', 'elementwise_div', False,
                          _scalar_div_)),
-        ('__rdiv__', _binary_creator_('__rdiv__', 'divide', True, None, True))
-        if framework._in_eager_mode_ else
-        ('__rdiv__',
-         _binary_creator_('__rdiv__', 'elementwise_div', True, None)),
-        ('__rtruediv__',
-         _binary_creator_('rtruediv__', 'divide', True, None, True))
-        if framework._in_eager_mode_ else
+        ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
+                                      None)),
        ('__rtruediv__',
         _binary_creator_('rtruediv__', 'elementwise_div', True, None)),
-        ('__pow__', _binary_creator_('__pow__', 'pow', False, _C_ops.pow, True))
-        if framework._in_eager_mode_ else
-        ('__pow__',
-         _binary_creator_('__pow__', 'elementwise_pow', False, None)),
+        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
+                                     None)),
        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
                                      None)),
-        ('__floordiv__',
-         _binary_creator_('__floordiv__', 'floor_divide', False, None, True))
-        if framework._in_eager_mode_ else
        ('__floordiv__',
         _binary_creator_('__floordiv__', 'elementwise_floordiv', False, None)),
-        ('__mod__', _binary_creator_('__mod__', 'remainder', False, None, True))
-        if framework._in_eager_mode_ else
-        ('__mod__',
-         _binary_creator_('__mod__', 'elementwise_mod', False, None)),
-        ('__matmul__',
-         _binary_creator_('__matmul__', "matmul", False, None, True))
-        if framework._in_eager_mode_ else
-        ('__matmul__',
-         _binary_creator_('__matmul__', "matmul_v2", False, None)),
+        ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
+                                     None)),
+        ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False,
+                                        None)),
        ## for logical compare
-        ('__eq__', _binary_creator_('__eq__', 'equal', False, None, True))
-        if framework._in_eager_mode_ else
        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
-        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None, True))
-        if framework._in_eager_mode_ else
        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
-        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None, True))
-        if framework._in_eager_mode_ else
        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
-        ('__le__', _binary_creator_('__le__', 'less_equal', False, None, True))
-        if framework._in_eager_mode_ else
        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
-        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None,
-                                    True)) if framework._in_eager_mode_ else
        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
-        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None,
-                                    True)) if framework._in_eager_mode_ else
        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
        ('__array_ufunc__', None)
    ]

+    eager_methods = [
+        ('__neg__', _neg_),
+        ('__float__', _float_),
+        ('__long__', _long_),
+        ('__int__', _int_),
+        ('__len__', _len_),
+        ('__index__', _index_),
+        ('astype', astype),
+        ('dim', lambda x: len(x.shape)),
+        ('ndimension', lambda x: len(x.shape)),
+        ('ndim', _ndim_),
+        ('size', _size_),
+        ('T', _T_),
+        ('__mul__',
+         _binary_creator_('__mul__', 'multiply', False, _scalar_mul_, True)),
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'multiply', False, _scalar_mul_, True)),
+        ('__div__',
+         _binary_creator_('__div__', 'divide', False, _scalar_div_, True)),
+        ('__truediv__',
+         _binary_creator_('__truediv__', 'divide', False, _scalar_div_, True)),
+        ('__rdiv__', _binary_creator_('__rdiv__', 'divide', True, None, True)),
+        ('__rtruediv__',
+         _binary_creator_('rtruediv__', 'divide', True, None, True)),
+        ('__pow__', _binary_creator_('__pow__', 'pow', False, _C_ops.pow,
+                                     True)),
+        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
+                                      None)),
+        ('__floordiv__',
+         _binary_creator_('__floordiv__', 'floor_divide', False, None, True)),
+        ('__mod__', _binary_creator_('__mod__', 'remainder', False, None,
+                                     True)),
+        ('__matmul__',
+         _binary_creator_('__matmul__', "matmul", False, None, True)),
+        # for logical compare
+        ('__eq__', _binary_creator_('__eq__', 'equal', False, None, True)),
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None, True)),
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None, True)),
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None, True)),
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None,
+                                    True)),
+        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None,
+                                    True)),
+        ('__array_ufunc__', None)
+    ]
+
+    eager_cpp_level_patch = [
+        "__add__",
+        "__radd__",
+        '__sub__',
+        '__rsub__',
+    ]
+
    global _already_patch_varbase
    global _already_patch_eager_tensor

@@ -439,10 +442,22 @@ def monkey_patch_math_varbase():
        local_tensor = core.VarBase

    if not local_already_patch:
-        for method in varbase_methods:
-            method_name = method[0]
-            method_impl = method[1]
-            setattr(local_tensor, method_name, method_impl)
+        if framework._in_eager_mode_:
+            for method_name in eager_cpp_level_patch:
+                method_impl = getattr(local_tensor, method_name, None)
+                if method_impl:
+                    setattr(local_tensor, method_name, method_impl)
+
+            for method in eager_methods:
+                method_name = method[0]
+                method_impl = method[1]
+                setattr(local_tensor, method_name, method_impl)
+
+        else:
+            for method in varbase_methods:
+                method_name = method[0]
+                method_impl = method[1]
+                setattr(local_tensor, method_name, method_impl)
    else:
        import paddle.tensor
        # Tensor method from module paddle.tensor

--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -20,6 +20,7 @@ import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid.framework import _test_eager_guard


 class TestElementwiseAddOp(OpTest):
@@ -693,13 +694,42 @@ class TestBoolAddFloatElementwiseAddop(unittest.TestCase):
        self.assertTrue(c.dtype == core.VarDesc.VarType.FP32)
        paddle.enable_static()

-    def test_dygraph_add(self):
+    def func_dygraph_add(self):
        paddle.disable_static()
        a = 1.5
-        b = paddle.full([4, 5, 6], True, dtype='bool')
+        b = paddle.full([2], True, dtype='bool')
+        # special case: scalar + tensor(bool)
        c = a + b
        self.assertTrue(c.dtype == core.VarDesc.VarType.FP32)

+        np_a = np.random.random((2, 3, 4)).astype(np.float64)
+        np_b = np.random.random((2, 3, 4)).astype(np.float64)
+
+        tensor_a = paddle.to_tensor(np_a, dtype="float32")
+        tensor_b = paddle.to_tensor(np_b, dtype="float32")
+
+        # normal case: tensor + tensor
+        expect_out = np_a + np_b
+        actual_out = tensor_a + tensor_b
+        np.testing.assert_allclose(actual_out, expect_out)
+
+        # normal case: tensor + scalar
+        expect_out = np_a + 1
+        actual_out = tensor_a + 1
+        np.testing.assert_allclose(actual_out, expect_out)
+
+        # normal case: scalar + tenor
+        expect_out = 1 + np_a
+        actual_out = 1 + tensor_a
+        np.testing.assert_allclose(actual_out, expect_out)
+
+        paddle.enable_static()
+
+    def test_dygraph_add(self):
+        with _test_eager_guard():
+            self.func_dygraph_add()
+        self.func_dygraph_add()
+

 if __name__ == '__main__':
    paddle.enable_static()

--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -19,6 +19,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.framework import _test_eager_guard


 class TestElementwiseOp(OpTest):
@@ -394,6 +395,49 @@ class TestSubtractInplaceBroadcastError3(TestSubtractInplaceBroadcastError):
        self.y_numpy = np.random.rand(2, 3, 4).astype('float')


+class TestFloatElementwiseSubop(unittest.TestCase):
+
+    def func_dygraph_sub(self):
+        paddle.disable_static()
+
+        np_a = np.random.random((2, 3, 4)).astype(np.float64)
+        np_b = np.random.random((2, 3, 4)).astype(np.float64)
+
+        tensor_a = paddle.to_tensor(np_a, dtype="float32")
+        tensor_b = paddle.to_tensor(np_b, dtype="float32")
+
+        # normal case: tensor - tensor
+        expect_out = np_a - np_b
+        actual_out = tensor_a - tensor_b
+        np.testing.assert_allclose(actual_out,
+                                   expect_out,
+                                   rtol=1e-07,
+                                   atol=1e-07)
+
+        # normal case: tensor - scalar
+        expect_out = np_a - 1
+        actual_out = tensor_a - 1
+        np.testing.assert_allclose(actual_out,
+                                   expect_out,
+                                   rtol=1e-07,
+                                   atol=1e-07)
+
+        # normal case: scalar - tenor
+        expect_out = 1 - np_a
+        actual_out = 1 - tensor_a
+        np.testing.assert_allclose(actual_out,
+                                   expect_out,
+                                   rtol=1e-07,
+                                   atol=1e-07)
+
+        paddle.enable_static()
+
+    def test_dygraph_sub(self):
+        with _test_eager_guard():
+            self.func_dygraph_sub()
+        self.func_dygraph_sub()
+
+
 if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 import warnings
 import paddle
+from paddle.fluid.framework import _test_eager_guard


 class TestTensorTypePromotion(unittest.TestCase):
@@ -26,7 +27,7 @@ class TestTensorTypePromotion(unittest.TestCase):
        self.x = paddle.to_tensor([2, 3])
        self.y = paddle.to_tensor([1.0, 2.0])

-    def test_operator(self):
+    def add_operator(self):
        with warnings.catch_warnings(record=True) as context:
            warnings.simplefilter("always")
            self.x + self.y
@@ -34,6 +35,7 @@ class TestTensorTypePromotion(unittest.TestCase):
                "The dtype of left and right variables are not the same" in str(
                    context[-1].message))

+    def sub_operator(self):
        with warnings.catch_warnings(record=True) as context:
            warnings.simplefilter("always")
            self.x - self.y
@@ -41,6 +43,7 @@ class TestTensorTypePromotion(unittest.TestCase):
                "The dtype of left and right variables are not the same" in str(
                    context[-1].message))

+    def mul_operator(self):
        with warnings.catch_warnings(record=True) as context:
            warnings.simplefilter("always")
            self.x * self.y
@@ -48,6 +51,7 @@ class TestTensorTypePromotion(unittest.TestCase):
                "The dtype of left and right variables are not the same" in str(
                    context[-1].message))

+    def div_operator(self):
        with warnings.catch_warnings(record=True) as context:
            warnings.simplefilter("always")
            self.x / self.y
@@ -55,6 +59,18 @@ class TestTensorTypePromotion(unittest.TestCase):
                "The dtype of left and right variables are not the same" in str(
                    context[-1].message))

+    def test_operator(self):
+        with _test_eager_guard():
+            self.setUp()
+            # add and sub has been sunk to cpp level, there is no warnings to catch by this test.
+            self.mul_operator()
+            self.div_operator()
+        self.setUp()
+        self.add_operator()
+        self.sub_operator()
+        self.mul_operator()
+        self.div_operator()
+

 if __name__ == '__main__':
    unittest.main()