Unverified commit 52f63cd2, authored by wanghuancoder, committed by GitHub

[Eager] generate eager core ops, only 4 ops (#37813)

* refine a test case, test=develop

* publish python c api for eager, test=develop

* revert modify about test_allclose_layer.py, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* delete numpy includes, use pybind11 numpy.h, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* suport eager error msg, and add grad test case, test=develop

* refine, test=develop

* refine, test=develop

* generate eager core ops, only 4 ops, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop
Parent cf873c39
@@ -134,17 +134,25 @@ if(WITH_PYTHON)
add_executable(op_function_generator op_function_generator.cc)
target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
add_executable(eager_op_function_generator eager_op_function_generator.cc)
target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(op_function_generator ${os_dependency_modules})
target_link_libraries(eager_op_function_generator ${os_dependency_modules})
if(WITH_ROCM)
target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
target_link_libraries(eager_op_function_generator ${ROCM_HIPRTC_LIB})
endif()
set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
set(tmp_impl_file ${impl_file}.tmp)
set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h)
set(tmp_eager_impl_file ${eager_impl_file}.tmp)
set(OP_IMPL_DEPS op_function_generator)
set(EAGER_OP_IMPL_DEPS eager_op_function_generator)
if(WIN32)
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}")
@@ -168,22 +176,41 @@ if(WITH_PYTHON)
")\n"
"exit /b 0")
file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat ""
"set build_times=1\n"
":retry\n"
"ECHO eager_op_function_generator run %build_times% time\n"
"taskkill /f /im eager_op_function_generator.exe 2>NUL\n"
"${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n"
"if %ERRORLEVEL% NEQ 0 (\n"
" set /a build_times=%build_times%+1\n"
" if %build_times% GEQ 10 (\n"
" exit /b 1\n"
" ) else (\n"
" goto :retry\n"
" )\n"
")\n"
"exit /b 0")
if(${CBLAS_PROVIDER} STREQUAL MKLML)
ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path}
DEPENDS mklml)
list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll)
list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll)
else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll
COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path}
DEPENDS extern_openblas)
list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll)
list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll)
endif()
if(WITH_MKLDNN)
ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path}
DEPENDS mkldnn)
list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
endif()
add_custom_command(OUTPUT ${impl_file}
@@ -191,6 +218,13 @@ if(WITH_PYTHON)
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
DEPENDS ${OP_IMPL_DEPS})
if(NOT ON_INFER)
add_custom_command(OUTPUT ${eager_impl_file}
COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file}
COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}"
DEPENDS ${EAGER_OP_IMPL_DEPS})
endif()
else(WIN32)
# If there are no *.so in /usr/lib or LD_LIBRARY_PATH,
# copy these *.so to current directory and append current directory to
@@ -201,12 +235,14 @@ if(WITH_PYTHON)
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS mklml)
list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
endif()
if(WITH_MKLDNN)
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS mkldnn)
list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
endif()
add_custom_command(OUTPUT ${impl_file}
COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
@@ -216,15 +252,34 @@ if(WITH_PYTHON)
COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
DEPENDS ${OP_IMPL_DEPS}
VERBATIM)
if(NOT ON_INFER)
add_custom_command(OUTPUT ${eager_impl_file}
COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
"${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator"
"${tmp_eager_impl_file}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file}
COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}"
DEPENDS ${EAGER_OP_IMPL_DEPS}
VERBATIM)
endif()
endif(WIN32)
add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file})
if(NOT ON_INFER)
add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file})
endif()
list(APPEND PYBIND_DEPS interpretercore standalone_executor)
cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS})
list(APPEND PYBIND_DEPS op_function_common)
Before:
cc_library(paddle_eager
SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc
DEPS autograd_meta grad_node_info pten global_utils utils eager_api accumulation_node backward python)
list(APPEND PYBIND_DEPS paddle_eager)

After:
if(NOT ON_INFER)
cc_library(paddle_eager
SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc
DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu creation_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python)
add_dependencies(paddle_eager eager_codegen)
add_dependencies(paddle_eager eager_op_function_generator_cmd)
list(APPEND PYBIND_DEPS paddle_eager)
endif()
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
......
@@ -15,6 +15,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
...@@ -27,6 +28,7 @@ limitations under the License. */ ...@@ -27,6 +28,7 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/include/core.h" #include "paddle/pten/include/core.h"
#pragma GCC diagnostic ignored "-Wmissing-field-initializers" #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#include "paddle/fluid/pybind/eager_op_function_impl.h"
namespace paddle {
namespace pybind {
@@ -126,6 +128,7 @@ void BindEager(pybind11::module* module) {
}
BindFunctions(m.ptr());
BindEagerOpFunctions(&m);
}
}  // namespace pybind
......
@@ -37,6 +37,7 @@ extern PyTypeObject* pEagerTensorType;
static PyObject* eager_tensor_method_numpy(EagerTensorObject* self,
PyObject* args, PyObject* kwargs) {
EAGER_TRY
self->eagertensor.SyncToTensor();
if (!self->eagertensor.initialized()) {
Py_INCREF(Py_None);
return Py_None;
@@ -93,6 +94,9 @@ static PyObject* eager_tensor_method_is_initialized(EagerTensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
if (self->eagertensor.Var().IsInitialized()) {
self->eagertensor.SyncToTensor();
}
return ToPyObject(self->eagertensor.initialized());
EAGER_CATCH_AND_THROW_RETURN_NULL
}
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#ifndef _WIN32
#include <unistd.h>
#endif
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif
#include "paddle/fluid/pybind/op_function_generator.h"
std::set<std::string> gen_list = {"elementwise_add", "reduce_sum", "matmul_v2",
"sigmoid"};
// clang-format off
const char* OUT_INITIALIZER_TEMPLATE =
R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})";
const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})";
const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})";
const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})";
const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"(
if (%s != nullptr) {
ins["%s"] = {%s};
}
)";
const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"(
if (%s.size() != 0) {
ins["%s"] = %s;
}
)";
const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"(
outs["%s"] = {%s};
)";
const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"(
outs["%s"] = %s;
)";
// if the input is a list, there is no need for {}
const char* ARG_OUT_NUM = R"(%sNum)";
const char* ARG_OUT_NUM_TYPE = R"(size_t )";
const char* IN_VAR_TYPE = R"(py::handle)";
const char* IN_VAR_LIST_TYPE = R"(py::handle)";
const char* OUT_VAR_TYPE = R"(std::shared_ptr<imperative::VarBase>)";
const char* OUT_VAR_LIST_TYPE = R"(std::vector<std::shared_ptr<imperative::VarBase>>)";
const char* CAST_VAR_TEMPLATE = R"(
auto %s = GetEagerTensorFromArgs("%s", "%s", args, %d, %s);)";
const char* CAST_VAR_LIST_TEMPLATE = R"(
auto %s = GetEagerTensorListFromArgs("%s", "%s", args, %d, %s);)";
const char* CAST_SIZE_T_TEMPLATE = R"(
auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)";
const char* ARG_TEMPLATE = R"(const %s& %s)";
const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)";
const char* RETURN_TUPLE_TEMPLATE = R"(std::make_tuple(%s))";
const char* RETURN_LIST_TEMPLATE = R"(outs["%s"])";
const char* RETURN_TEMPLATE = R"(outs["%s"][0])";
const char* FUNCTION_ARGS = R"(%s, const py::args& args)";
const char* FUNCTION_ARGS_NO_INPUT = R"(const py::args& args)";
const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = R"(
if (ins.count("%s") && outs.count("%s")) {
HandleViewBetweenInputAndOutput(ins["%s"][0], outs["%s"][0]);
})";
const char* OP_FUNCTION_TEMPLATE =
R"(
static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyThreadState *tstate = nullptr;
try
{
%s
framework::AttributeMap attrs;
ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs);
tstate = PyEval_SaveThread();
%s
PyEval_RestoreThread(tstate);
tstate = nullptr;
%s
}
catch(...) {
if (tstate) {
PyEval_RestoreThread(tstate);
}
ThrowExceptionToPython(std::current_exception());
return nullptr;
}
})";
const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)";
// clang-format on
static inline bool FindInsMap(const std::string& op_type,
const std::string& in_name) {
return op_ins_map[op_type].count(in_name);
}
static inline bool FindOutsMap(const std::string& op_type,
const std::string& out_name) {
return op_outs_map[op_type].count(out_name);
}
static inline bool FindPassingOutsMap(const std::string& op_type,
const std::string& out_name) {
return op_passing_outs_map[op_type].count(out_name);
}
static inline bool FindViewOpMap(const std::string& op_type) {
return view_op_map.count(op_type);
}
static inline std::string TempName(const std::string& name) {
return name + '_';
}
std::string GenerateOpFunctionsBody(
const paddle::framework::proto::OpProto* op_proto, std::string func_name,
bool use_inplace_strategy = false,
std::map<std::string, std::string> inplace_map = {}) {
auto& op_type = op_proto->type();
std::string input_args = "";
std::string call_api_str = "auto out = " + op_type + "_dygraph_function(";
std::string ins_initializer_with_null = "";
std::string py_arg = "";
int arg_idx = 0;
int input_args_num = 0;
std::string ins_cast_str = "";
std::string view_strategy_str = "";
for (auto& input : op_proto->inputs()) {
auto& in_name = input.name();
// skip those dispensable inputs, like ResidualData in conv2d
if (input.dispensable() && !FindInsMap(op_type, in_name)) {
continue;
}
const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE;
auto input_arg =
paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name));
input_args += input_arg;
input_args += ",";
input_args_num++;
const auto in_cast_type =
input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE;
auto dispensable = input.dispensable() ? "true" : "false";
ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type,
in_name, arg_idx++, dispensable);
if (input.dispensable()) {
const auto in_template = input.duplicable()
? INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST
: INPUT_INITIALIZER_TEMPLATE_WITH_NULL;
ins_initializer_with_null +=
paddle::string::Sprintf(in_template, in_name, in_name, in_name);
} else {
call_api_str += in_name + ", ";
}
}
if (!input_args.empty() && input_args.back() == ',') {
input_args.pop_back();
}
// Generate outs initializer
std::string outs_initializer = "{";
std::string outs_initializer_with_null = "";
std::string return_str = "";
int outs_num = 0;
for (auto& output : op_proto->outputs()) {
auto& out_name = output.name();
// skip those dispensable outputs
if (output.dispensable() && !FindOutsMap(op_type, out_name)) {
continue;
}
const auto out_type =
output.duplicable() ? OUT_VAR_LIST_TYPE : OUT_VAR_TYPE;
if (FindPassingOutsMap(op_type, out_name)) {
if (input_args != "") {
input_args += ",";
}
input_args += out_type;
input_args += out_name;
input_args_num++;
if (output.dispensable()) {
const auto out_template =
output.duplicable() ? OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST
: OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL;
outs_initializer_with_null +=
paddle::string::Sprintf(out_template, out_name, out_name);
} else {
const auto out_template = output.duplicable()
? INPUT_LIST_INITIALIZER_TEMPLATE
: INPUT_INITIALIZER_TEMPLATE;
outs_initializer +=
paddle::string::Sprintf(out_template, out_name, out_name);
outs_initializer += ",";
}
const auto in_cast_type =
output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE;
auto dispensable = output.dispensable() ? "true" : "false";
ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type,
out_name, arg_idx++, dispensable);
} else {
// There are a few operators that have duplicable outputs, like `Out` in
// the split op. We need to specify the number of variables for the
// duplicable output via the argument OutNum;
if (output.duplicable()) {
if (input_args != "") {
input_args += ",";
}
auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name);
input_args += ARG_OUT_NUM_TYPE;
input_args += out_num_str;
input_args_num++;
outs_initializer += paddle::string::Sprintf(
OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str);
auto dispensable = output.dispensable() ? "true" : "false";
ins_cast_str +=
paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type,
out_num_str, arg_idx++, dispensable);
call_api_str += out_num_str + ", ";
} else {
outs_initializer +=
paddle::string::Sprintf(OUT_INITIALIZER_TEMPLATE, out_name);
}
outs_initializer += ",";
}
// return_str += paddle::string::Sprintf(return_template, out_name);
// return_str += ",";
outs_num += 1;
}
call_api_str += "attrs);";
if (outs_initializer.back() == ',') {
outs_initializer.pop_back();
// return_str.pop_back();
}
outs_initializer += "}";
if (FindViewOpMap(op_type)) {
std::string view_input_name = view_op_map[op_type].first;
std::string view_output_name = view_op_map[op_type].second;
view_strategy_str += paddle::string::Sprintf(
HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, view_input_name, view_output_name,
view_input_name, view_output_name);
}
if (outs_num == 0) {
return_str = "Py_INCREF(Py_None);\n return Py_None;";
} else {
return_str = "return ToPyObject(out);";
}
std::string function_args = "";
if (input_args == "") {
function_args = FUNCTION_ARGS_NO_INPUT;
} else {
function_args = paddle::string::Sprintf(FUNCTION_ARGS, input_args);
}
// generate op function body
auto op_function_str = paddle::string::Sprintf(
OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num,
call_api_str, return_str);
return op_function_str;
}
static std::tuple<std::vector<std::string>, std::vector<std::string>>
GenerateOpFunctions() {
auto& op_info_map = paddle::framework::OpInfoMap::Instance().map();
std::vector<std::string> op_function_list, bind_function_list;
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
for (auto& pair : op_info_map) {
auto& op_info = pair.second;
auto op_proto = op_info.proto_;
if (op_proto == nullptr) {
continue;
}
auto& op_type = op_proto->type();
// Skip operators that do not inherit from OperatorWithKernel, like while,
// since only OperatorWithKernel can run in dygraph mode.
// If the pten lib contains the op kernel, we still generate the op's method.
if (!all_kernels.count(op_type) &&
!pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
continue;
}
if (!gen_list.count(op_type)) {
continue;
}
std::string func_name = "eager_api_" + op_type;
std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name);
// generate pybind item
auto bind_function_str = paddle::string::Sprintf(
PYBIND_ITEM_TEMPLATE, op_type, func_name, op_type);
op_function_list.emplace_back(std::move(op_function_str));
bind_function_list.emplace_back(std::move(bind_function_str));
}
return std::make_tuple(op_function_list, bind_function_list);
}
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "argc must be 2" << std::endl;
return -1;
}
#ifdef PADDLE_WITH_ASCEND_CL
auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
ascend_ptr->InitGEForUT();
#endif
std::vector<std::string> headers{
"\"pybind11/detail/common.h\"",
"\"paddle/fluid/pybind/op_function_common.h\"",
"\"paddle/fluid/pybind/exception.h\"", "<Python.h>"};
std::ofstream out(argv[1], std::ios::out);
out << "#pragma once\n\n";
for (auto& header : headers) {
out << "#include " + header + "\n";
}
out << "\n\n";
auto op_funcs = GenerateOpFunctions();
out << "namespace paddle {\n"
<< "namespace pybind {\n\n";
out << paddle::string::join_strings(std::get<0>(op_funcs), '\n');
out << "\n\n";
out << "static PyMethodDef ExtestMethods[] = {\n"
<< paddle::string::join_strings(std::get<1>(op_funcs), '\n')
<< "\n {nullptr,nullptr,0,nullptr}"
<< "};\n\n";
out << "inline void BindEagerOpFunctions(pybind11::module *module) {\n"
<< " auto m = module->def_submodule(\"ops\");\n"
<< " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n"
<< " PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
"core.eager.ops failed!\"));\n"
<< " }\n\n"
<< " InitOpsAttrTypeMap();"
<< "}\n\n"
<< "} // namespace pybind\n"
<< "} // namespace paddle\n";
out.close();
#ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize();
#endif
return 0;
}
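For orientation, here is a small usage sketch (not part of the diff) of how the ops generated by this file are expected to surface on the Python side once the generated BindEagerOpFunctions registers them under the core.eager.ops submodule. The op choice (sigmoid, one of the four entries in gen_list) and the input values are illustrative assumptions.

# Hypothetical usage sketch; assumes the core.eager.ops submodule registered
# by the generated BindEagerOpFunctions and an eager-mode tensor input.
import numpy as np
import paddle
import paddle.fluid.core as core
from paddle.fluid.framework import eager_guard

with eager_guard():
    paddle.set_device("cpu")
    x = paddle.to_tensor(np.array([-0.4, 0.1, 0.3], dtype='float32'))
    # Generated signature: inputs are passed positionally, then attributes
    # are parsed from the remaining args by ConstructAttrMapFromPyArgs
    # (sigmoid has none).
    out = core.eager.ops.sigmoid(x)
    print(out.numpy())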
@@ -37,6 +37,7 @@ extern PyTypeObject* p_eager_tensor_type;
PyObject* eager_tensor_properties_get_name(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
return ToPyObject(self->eagertensor.name());
EAGER_CATCH_AND_THROW_RETURN_NULL
}
@@ -44,6 +45,7 @@ PyObject* eager_tensor_properties_get_name(EagerTensorObject* self,
int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
self->eagertensor.set_name(CastPyArg2AttrString(value, 0));
return 0;
EAGER_CATCH_AND_THROW_RETURN_ZERO
@@ -52,6 +54,7 @@ int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value,
PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor);
return ToPyObject(meta->StopGradient());
EAGER_CATCH_AND_THROW_RETURN_NULL
@@ -60,6 +63,7 @@ PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eagertensor);
return ToPyObject(meta->Grad());
EAGER_CATCH_AND_THROW_RETURN_NULL
@@ -68,6 +72,7 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self,
int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self,
PyObject* value, void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor);
meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0));
return 0;
@@ -77,6 +82,7 @@ int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_persistable(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor);
return ToPyObject(meta->Persistable());
EAGER_CATCH_AND_THROW_RETURN_NULL
@@ -85,6 +91,7 @@ PyObject* eager_tensor_properties_get_persistable(EagerTensorObject* self,
int eager_tensor_properties_set_persistable(EagerTensorObject* self,
PyObject* value, void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor);
meta->SetPersistable(CastPyArg2AttrBoolean(value, 0));
return 0;
@@ -94,6 +101,7 @@ int eager_tensor_properties_set_persistable(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_shape(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
auto ddim = self->eagertensor.shape();
std::vector<int64_t> value;
size_t rank = static_cast<size_t>(ddim.size());
@@ -109,6 +117,7 @@ PyObject* eager_tensor_properties_get_shape(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_place(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
return ToPyObject(self->eagertensor.place());
EAGER_CATCH_AND_THROW_RETURN_NULL
}
@@ -116,6 +125,7 @@ PyObject* eager_tensor_properties_get_place(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_place_str(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
std::stringstream ostr;
ostr << self->eagertensor.place();
return ToPyObject(ostr.str());
@@ -125,6 +135,7 @@ PyObject* eager_tensor_properties_get_place_str(EagerTensorObject* self,
PyObject* eager_tensor_properties_get_dtype(EagerTensorObject* self,
void* closure) {
EAGER_TRY
self->eagertensor.SyncToTensor();
return ToPyObject(pten::DataType2String(self->eagertensor.type()));
EAGER_CATCH_AND_THROW_RETURN_NULL
}
......
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/pten/common/data_type.h" #include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/convert_utils.h"
...@@ -369,5 +370,82 @@ PyObject* ToPyObject(const platform::Place& value) { ...@@ -369,5 +370,82 @@ PyObject* ToPyObject(const platform::Place& value) {
return obj.ptr(); return obj.ptr();
} }
egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type,
const std::string& arg_name,
PyObject* args, ssize_t arg_idx,
bool dispensable) {
PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);
if (PyTuple_Check(obj)) {
obj = PyTuple_GET_ITEM(obj, 0);
}
if (obj == nullptr || obj == Py_None) {
if (!dispensable) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be Tensor, but got None",
op_type, arg_name, arg_idx));
}
egr::EagerTensor emptytensor;
return emptytensor;
}
return reinterpret_cast<EagerTensorObject*>(obj)->eagertensor;
}
std::vector<egr::EagerTensor> GetEagerTensorListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable) {
PyObject* list = PyTuple_GET_ITEM(args, arg_idx);
if (list == nullptr) {
if (!dispensable) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be list of Tensor, but got "
"None",
op_type, arg_name, arg_idx));
}
return {};
}
std::vector<egr::EagerTensor> result;
if (PyList_Check(list)) {
Py_ssize_t len = PyList_Size(list);
if (len == 0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be list of Tensors, but got "
"empty list",
op_type, arg_name, arg_idx));
}
for (Py_ssize_t i = 0; i < len; i++) {
result.emplace_back(
reinterpret_cast<EagerTensorObject*>(PyList_GetItem(list, i))
->eagertensor);
}
} else if (PyTuple_Check(list)) {
Py_ssize_t len = PyTuple_Size(list);
if (len == 0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be list of Tensors, but got "
"empty list",
op_type, arg_name, arg_idx));
}
for (Py_ssize_t i = 0; i < len; i++) {
result.emplace_back(
reinterpret_cast<EagerTensorObject*>(PyTuple_GetItem(list, i))
->eagertensor);
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be list of Tensors, but got "
"%s",
op_type, arg_name, arg_idx,
(reinterpret_cast<PyTypeObject*>(list->ob_type))->tp_name));
}
return result;
}
}  // namespace pybind
}  // namespace paddle
@@ -52,5 +52,38 @@ PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<egr::EagerTensor>& value);
PyObject* ToPyObject(const platform::Place& value);
template <typename Tuple, size_t N>
struct TupleEagerTensorResult {
static void Run(const Tuple& out, PyObject* result) {
TupleEagerTensorResult<Tuple, N - 1>::Run(out, result);
PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
}
};
template <typename Tuple>
struct TupleEagerTensorResult<Tuple, 1> {
static void Run(const Tuple& out, PyObject* result) {
PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out)));
}
};
template <typename... Args>
PyObject* ToPyObject(const std::tuple<Args...>& out) {
auto len = sizeof...(Args);
PyObject* result = PyTuple_New(len);
TupleEagerTensorResult<decltype(out), sizeof...(Args)>::Run(out, result);
return result;
}
egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type,
const std::string& arg_name,
PyObject* args, ssize_t arg_idx,
bool dispensable = false);
std::vector<egr::EagerTensor> GetEagerTensorListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false);
}  // namespace pybind
}  // namespace paddle
This diff is collapsed.
This diff is collapsed.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/chrono.h>
#include <pybind11/complex.h>
#include <pybind11/functional.h>
#include <pybind11/stl.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/pybind/imperative.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
bool PyObject_CheckBool(PyObject** obj);
bool PyObject_CheckLongOrToLong(PyObject** obj);
bool PyObject_CheckFloatOrToFloat(PyObject** obj);
bool PyObject_CheckString(PyObject* obj);
void CastPyArg2AttrBoolean(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrInt(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrLong(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrFloat(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrString(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrBooleans(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrInts(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrLongs(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrFloats(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrFloat64s(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrStrings(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void CastPyArg2AttrBlock(PyObject* obj,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::string& key, const std::string& op_type,
ssize_t arg_pos);
void ConstructAttrMapFromPyArgs(
const std::string& op_type, PyObject* args, ssize_t attr_start,
ssize_t attr_end,
paddle::framework::AttributeMap& attrs); // NOLINT
std::shared_ptr<imperative::VarBase> GetVarBaseFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false);
std::vector<std::shared_ptr<imperative::VarBase>> GetVarBaseListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false);
unsigned long GetUnsignedLongFromArgs( // NOLINT
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false);
void InitOpsAttrTypeMap();
} // namespace pybind
} // namespace paddle
@@ -32,77 +32,6 @@
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
// generated in C++ automatically.
// However, some OPs need to pass the outputs from Python instead of generating
// them in C++. There are mainly 2 reasons for that,
// (1) Optimizer OPs need to update the input param in-place, like sgd.
// So they need to pass the output which is same as input param.
// (2) Very few python APIs has out in their arguments, like fill_constant.
// So they need to pass the python output to C++.
// Actually, this is not a good design, since it may break the SSA graph,
// especially in declarative mode.
// For those OPs, we need to manually specify the outs need to pass in this map.
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"adamw",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"average_accumulates",
{"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates",
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
{"accuracy", {"Correct", "Total"}},
{"fill_constant", {"Out"}},
{"recv_v2", {"Out"}},
{"partial_recv", {"Out"}},
{"matmul", {"Out"}},
{"c_broadcast", {"Out"}},
{"c_sync_calc_stream", {"Out"}},
{"c_sync_comm_stream", {"Out"}},
{"c_reduce_sum", {"Out"}},
{"c_reduce_max", {"Out"}},
{"c_reduce_min", {"Out"}},
{"c_reduce_prod", {"Out"}},
{"c_reduce", {"Out"}},
{"c_scatter", {"Out"}},
{"barrier", {"Out"}},
{"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
{"moving_average_abs_max_scale",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"rnn", {"DropoutState"}},
{"run_program", {"Out", "DOut", "OutScope"}},
{"clear_float_status", {"FloatStatusOut"}},
{"get_float_status", {"FloatStatusOut"}},
};
// NOTE(pangyoki): Tensor View Strategy.
// In this case, a new output varbase will be created, and this varbase will
// reuse the input varbase's allocation.
// It's a map. The key of outer map is the view op name, the value is
// a pair which implies the mapping relationship between the input and
// output varbase.
std::map<std::string, std::pair<std::string, std::string>> view_op_map = {
{"squeeze2", {"X", "Out"}}, // "X" -> "Out"
{"unsqueeze2", {"X", "Out"}},
{"reshape2", {"X", "Out"}},
{"flatten_contiguous_range", {"X", "Out"}},
};
// NOTE(pangyoki): Inplace OP with duplicable input.
// The set includes inplace ops that have duplicable input.
// The first Varbase in input needs to be specified for the inplace strategy
......
@@ -119,3 +119,74 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
// generated in C++ automatically.
// However, some OPs need to pass the outputs from Python instead of generating
// them in C++. There are mainly 2 reasons for that,
// (1) Optimizer OPs need to update the input param in-place, like sgd.
// So they need to pass the output which is same as input param.
// (2) Very few python APIs has out in their arguments, like fill_constant.
// So they need to pass the python output to C++.
// Actually, this is not a good design, since it may break the SSA graph,
// especially in declarative mode.
// For those OPs, we need to manually specify the outs need to pass in this map.
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"adamw",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"average_accumulates",
{"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates",
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
{"accuracy", {"Correct", "Total"}},
{"fill_constant", {"Out"}},
{"recv_v2", {"Out"}},
{"partial_recv", {"Out"}},
{"matmul", {"Out"}},
{"c_broadcast", {"Out"}},
{"c_sync_calc_stream", {"Out"}},
{"c_sync_comm_stream", {"Out"}},
{"c_reduce_sum", {"Out"}},
{"c_reduce_max", {"Out"}},
{"c_reduce_min", {"Out"}},
{"c_reduce_prod", {"Out"}},
{"c_reduce", {"Out"}},
{"c_scatter", {"Out"}},
{"barrier", {"Out"}},
{"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
{"moving_average_abs_max_scale",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"rnn", {"DropoutState"}},
{"run_program", {"Out", "DOut", "OutScope"}},
{"clear_float_status", {"FloatStatusOut"}},
{"get_float_status", {"FloatStatusOut"}},
};
// NOTE(pangyoki): Tensor View Strategy.
// In this case, a new output varbase will be created, and this varbase will
// reuse the input varbase's allocation.
// It's a map. The key of outer map is the view op name, the value is
// a pair which implies the mapping relationship between the input and
// output varbase.
std::map<std::string, std::pair<std::string, std::string>> view_op_map = {
{"squeeze2", {"X", "Out"}}, // "X" -> "Out"
{"unsqueeze2", {"X", "Out"}},
{"reshape2", {"X", "Out"}},
{"flatten_contiguous_range", {"X", "Out"}},
};
@@ -19,3 +19,21 @@ __all__ = []
for name in dir(core.ops):
    globals()[name] = getattr(core.ops, name)
    __all__.append(name)
def switch_to_core_ops():
    for name in dir(core.eager.ops):
        del globals()[name]
        __all__.remove(name)
    for name in dir(core.ops):
        globals()[name] = getattr(core.ops, name)
        __all__.append(name)


def switch_to_eager_ops():
    for name in dir(core.ops):
        del globals()[name]
        __all__.remove(name)
    for name in dir(core.eager.ops):
        globals()[name] = getattr(core.eager.ops, name)
        __all__.append(name)
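A brief illustration (an assumption on my part, not part of the diff) of what the two helpers above are expected to do to the paddle._C_ops namespace:

# Illustrative sketch: after switch_to_eager_ops(), module-level names in
# paddle._C_ops are rebound to the functions exposed by core.eager.ops
# (only the four generated ops); switch_to_core_ops() restores core.ops.
from paddle import _C_ops
import paddle.fluid.core as core

_C_ops.switch_to_eager_ops()
assert _C_ops.sigmoid is getattr(core.eager.ops, 'sigmoid')

_C_ops.switch_to_core_ops()
assert _C_ops.sigmoid is getattr(core.ops, 'sigmoid')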
@@ -40,6 +40,7 @@ import paddle.version as fluid_version
import warnings
import functools
from .variable_index import _getitem_impl_, _setitem_impl_
from paddle import _C_ops
__all__ = [
    'Program',
@@ -82,13 +83,36 @@ _eager_mode_ = False
Before:
@signature_safe_contextmanager
def eager_guard():
    global _eager_mode_
    _eager_mode_ = True
    try:
        yield
    finally:
        _eager_mode_ = False

After:
@signature_safe_contextmanager
def eager_mode_place_guard(place):
    if place is not None:
        expected_place = _get_paddle_place(place)
    else:
        expected_place = _current_expected_place()

    global _global_expected_place_
    tmp_place = _global_expected_place_
    _global_expected_place_ = expected_place
    _set_expected_place(expected_place)

    try:
        yield
    finally:
        _global_expected_place_ = tmp_place
        _set_expected_place(tmp_place)


@signature_safe_contextmanager
def eager_guard(place=None):
    global _eager_mode_
    _eager_mode_ = True
    _C_ops.switch_to_eager_ops()
    try:
        with eager_mode_place_guard(place):
            yield
    finally:
        _eager_mode_ = False
        _C_ops.switch_to_core_ops()


def in_eager_mode():
......
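A minimal usage sketch (an assumption, not from the diff) of the updated eager_guard, showing the new optional place argument being routed through eager_mode_place_guard; the "cpu" place is only an example:

# Hypothetical example of the new eager_guard(place=...) parameter.
import paddle
from paddle.fluid.framework import eager_guard, in_eager_mode

with eager_guard(place="cpu"):
    assert in_eager_mode()          # _eager_mode_ is True inside the guard
    x = paddle.to_tensor([1.0, 2.0])
    y = paddle.to_tensor([3.0, 4.0])
    out = paddle.add(x, y)          # dispatched through the eager core ops
assert not in_eager_mode()          # restored on exit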
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.core as core
import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods
import paddle
import numpy as np
from paddle.fluid import eager_guard
import unittest
class EagerOpAPIGenerateTestCase(unittest.TestCase):
    def test_elementwise_add(self):
        with eager_guard():
            paddle.set_device("cpu")
            np_x = np.ones([4, 16, 16, 32]).astype('float32')
            np_y = np.ones([4, 16, 16, 32]).astype('float32')
            x = paddle.to_tensor(np_x)
            y = paddle.to_tensor(np_y)
            out = paddle.add(x, y)
            out_arr = out.numpy()
            out_arr_expected = np.add(np_x, np_y)
            self.assertTrue(np.array_equal(out_arr, out_arr_expected))

    def test_sum(self):
        with eager_guard():
            x_data = np.array(
                [[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]]).astype('float32')
            x = paddle.to_tensor(x_data, 'float32')
            out = paddle.sum(x, axis=0)
            out_arr = out.numpy()
            out_arr_expected = np.sum(x_data, axis=0)
            self.assertTrue(np.array_equal(out_arr, out_arr_expected))

    def test_mm(self):
        with eager_guard():
            np_input = np.random.random([16, 32]).astype('float32')
            np_mat2 = np.random.random([32, 32]).astype('float32')
            input = paddle.to_tensor(np_input)
            mat2 = paddle.to_tensor(np_mat2)
            out = paddle.mm(input, mat2)
            out_arr = out.numpy()
            out_arr_expected = np.matmul(np_input, np_mat2)
            self.assertTrue(np.allclose(out_arr, out_arr_expected))

    def test_sigmoid(self):
        with eager_guard():
            np_x = np.array([-0.4, -0.2, 0.1, 0.3]).astype('float32')
            x = paddle.to_tensor(np_x)
            out = paddle.nn.functional.sigmoid(x)
            out_arr = out.numpy()
            out_arr_expected = np.array(
                [0.40131234, 0.450166, 0.52497919, 0.57444252]).astype(
                    'float32')
            self.assertTrue(np.allclose(out_arr, out_arr_expected))


if __name__ == "__main__":
    unittest.main()