Unverified commit 4baf0dbe, authored by wanghuancoder, committed by GitHub

Compilation optimization (#44242)

* Compilation optimization
Parent e9b4d0be
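The changes below shorten build and link times by (a) sharding the huge generated sources (op_function.cc, the dygraph forward args/returns info files) into several smaller .cc files that can compile in parallel, (b) compiling the manually written eager/fluid functions and nodes directly into the generated libraries as source lists instead of separate cc_library targets, and (c) folding the interpretercore sub-libraries into a single standalone_executor target. A minimal Python sketch of the sharding arithmetic the generators use (illustrative only; the shard() helper below is hypothetical and not part of the patch):

    # Hypothetical illustration of the file-sharding scheme: ceiling division,
    # the same arithmetic as cc_file_api_size in the C++ generator further down.
    def shard(items, split_count):
        per_file = len(items) // split_count
        if len(items) % split_count != 0:
            per_file += 1
        return [items[i:i + per_file] for i in range(0, len(items), per_file)]

    # e.g. 1000 generated op bindings into 8 .cc files -> 125 bindings per file
    buckets = shard(list(range(1000)), 8)
    assert len(buckets) == 8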
@@ -6,7 +6,7 @@ cc_library(
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     final_dygraph_node
-    SRCS nodes.cc
-    DEPS ${eager_deps} ${eager_manual_nodes})
+    SRCS nodes.cc ${eager_manual_nodes}
+    DEPS ${eager_deps})
   add_dependencies(final_dygraph_node eager_final_state_codegen)
 endif()
@@ -6,7 +6,7 @@ cc_library(
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     final_dygraph_function
-    SRCS dygraph_functions.cc
-    DEPS ${eager_deps} ${eager_manual_functions})
+    SRCS dygraph_functions.cc ${eager_manual_functions}
+    DEPS ${eager_deps})
   add_dependencies(final_dygraph_function eager_final_state_codegen)
 endif()
-cc_library(
-  add_n_fwd_func
-  SRCS add_n_fwd_func.cc
-  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-add_dependencies(add_n_fwd_func eager_codegen)
-cc_library(
-  conv2d_fwd_function
-  SRCS conv2d_fwd_function.cc
-  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-add_dependencies(conv2d_fwd_function eager_codegen)
 set(eager_manual_functions
-    conv2d_fwd_function add_n_fwd_func
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
     PARENT_SCOPE)
-cc_library(
-  add_n_node
-  SRCS add_n_node.cc
-  DEPS ${eager_deps} ${fluid_deps})
-cc_library(
-  conv2d_nodes
-  SRCS conv2d_nodes.cc
-  DEPS ${eager_deps} ${fluid_deps})
 set(eager_manual_nodes
-    conv2d_nodes add_n_node
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
     PARENT_SCOPE)
-cc_library(
-  fused_gate_attention_fwd_func
-  SRCS fused_gate_attention_fwd_func.cc
-  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-add_dependencies(fused_gate_attention_fwd_func eager_codegen
-                 copy_dygraph_forward_functions)
-cc_library(
-  fused_feedforward_fwd_func
-  SRCS fused_feedforward_fwd_func.cc
-  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-add_dependencies(fused_feedforward_fwd_func eager_codegen
-                 copy_dygraph_forward_functions)
-cc_library(
-  fused_attention_fwd_func
-  SRCS fused_attention_fwd_func.cc
-  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-add_dependencies(fused_attention_fwd_func eager_codegen
-                 copy_dygraph_forward_functions)
 set(fluid_manual_functions
-    fused_gate_attention_fwd_func fused_feedforward_fwd_func
-    fused_attention_fwd_func
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
     PARENT_SCOPE)
-cc_library(
-  fused_gate_attention_node
-  SRCS fused_gate_attention_node.cc
-  DEPS ${eager_deps} ${fluid_deps})
-cc_library(
-  fused_feedforward_node
-  SRCS fused_feedforward_node.cc
-  DEPS ${eager_deps} ${fluid_deps})
-cc_library(
-  fused_attention_node
-  SRCS fused_attention_node.cc
-  DEPS ${eager_deps} ${fluid_deps})
 set(fluid_manual_nodes
-    fused_gate_attention_node fused_feedforward_node fused_attention_node
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
     PARENT_SCOPE)
@@ -3083,27 +3083,44 @@ static std::string ConvertCoreOpsInfosToString(
   return core_ops_returns_info_init_str;
 }
-static std::string GenerateCoreOpsReturnsInfo() {
+static std::string GenerateCoreOpsArgsInfo() {
   const char* Core_Ops_Returns_MAP_TEMPLATE =
       "std::unordered_map<std::string, std::vector<std::string>> "
-      "core_ops_args_info = { %s };\n"
-      "std::unordered_map<std::string, std::vector<std::string>> "
-      "core_ops_args_type_info = { %s };\n"
-      "std::unordered_map<std::string, std::vector<std::string>> "
-      "core_ops_returns_info = { %s };\n";
+      "core_ops_args_info = { %s };\n";
   std::string core_ops_args_info_init_str =
       ConvertCoreOpsInfosToString(core_ops_args_info);
+  std::string core_ops_info_str = paddle::string::Sprintf(
+      Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str);
+  return core_ops_info_str;
+}
+
+static std::string GenerateCoreOpsArgsTypeInfo() {
+  const char* Core_Ops_Returns_MAP_TEMPLATE =
+      "std::unordered_map<std::string, std::vector<std::string>> "
+      "core_ops_args_type_info = { %s };\n";
   std::string core_ops_args_type_info_init_str =
       ConvertCoreOpsInfosToString(core_ops_args_type_info);
+  std::string core_ops_info_str = paddle::string::Sprintf(
+      Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str);
+  return core_ops_info_str;
+}
+
+static std::string GenerateCoreOpsReturnsInfo() {
+  const char* Core_Ops_Returns_MAP_TEMPLATE =
+      "std::unordered_map<std::string, std::vector<std::string>> "
+      "core_ops_returns_info = { %s };\n";
   std::string core_ops_returns_info_init_str =
       ConvertCoreOpsInfosToString(core_ops_returns_info);
-  std::string core_ops_info_str =
-      paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE,
-                              core_ops_args_info_init_str,
-                              core_ops_args_type_info_init_str,
-                              core_ops_returns_info_init_str);
+  std::string core_ops_info_str = paddle::string::Sprintf(
+      Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str);
   return core_ops_info_str;
 }
@@ -3252,6 +3269,12 @@ static void DygraphCodeGeneration(const std::string& output_dir,
   GenerateForwardDygraphFile(
       output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc",
+      GenerateCoreOpsArgsInfo());
+  GenerateForwardDygraphFile(
+      output_dir + "/forwards/dygraph_forward_functions_args_type_info.tmp.cc",
+      GenerateCoreOpsArgsTypeInfo());
+  GenerateForwardDygraphFile(
+      output_dir + "/forwards/dygraph_forward_functions_returns_info.tmp.cc",
       GenerateCoreOpsReturnsInfo());
   VLOG(6) << "-------- GenerateNodeCCFile -------";
...
@@ -96,6 +96,11 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
                 "nodes" + str(i + 1) + ".cc"))
     empty_files.append(
         os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc"))
+    empty_files.append(
+        os.path.join(forwards_dir,
+                     "dygraph_forward_functions_args_type_info.cc"))
+    empty_files.append(
+        os.path.join(forwards_dir, "dygraph_forward_functions_returns_info.cc"))
     for path in empty_files:
         if not os.path.exists(path):
             open(path, 'a').close()
@@ -125,7 +130,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
         f.write("cc_library(dygraph_node SRCS ")
         for i in range(split_count):
             f.write("nodes" + str(i + 1) + ".cc ")
-        f.write("DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n")
+        f.write("${fluid_manual_nodes} DEPS ${eager_deps} ${fluid_deps})\n")
        f.write("add_dependencies(dygraph_node copy_dygraph_node)")
     with open(forwards_level_cmakelist_path, "w") as f:
@@ -143,6 +148,12 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
         f.write(
            "  COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n"
         )
+        f.write(
+            "  COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.cc\"\n"
+        )
+        f.write(
+            "  COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n"
+        )
         f.write("  DEPENDS eager_codegen\n")
         f.write("  VERBATIM)\n")
@@ -150,8 +161,10 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
         for i in range(split_count):
             f.write("dygraph_forward_functions" + str(i + 1) + ".cc ")
         f.write("dygraph_forward_functions_args_info.cc ")
+        f.write("dygraph_forward_functions_args_type_info.cc ")
+        f.write("dygraph_forward_functions_returns_info.cc ")
         f.write(
-            "DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n"
+            "${fluid_manual_functions} DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n"
         )
         f.write(
             "add_dependencies(dygraph_function copy_dygraph_forward_functions)")
...
-set(INTERPRETERCORE_DEPS
+add_subdirectory(workqueue)
+add_subdirectory(garbage_collector)
+
+set(STANDALONE_EXECUTOR_SRCS
+    data_transfer.cc
+    new_executor_defs.cc
+    interpretercore_util.cc
+    event_manager.cc
+    stream_analyzer.cc
+    interpretercore.cc
+    standalone_executor.cc)
+
+set(STANDALONE_EXECUTOR_DEPS
     op_registry
     device_context
     scope
@@ -20,62 +32,33 @@ set(INTERPRETERCORE_DEPS
     variable_helper
     timer
     monitor
-    nan_inf_utils)
-
-add_subdirectory(workqueue)
-add_subdirectory(garbage_collector)
-
-cc_library(
-  data_transfer
-  SRCS data_transfer.cc
-  DEPS enforce scope glog)
-cc_library(
-  new_executor_defs
-  SRCS new_executor_defs.cc
-  DEPS enforce glog scope)
-cc_library(
-  interpretercore_util
-  SRCS interpretercore_util.cc
-  DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
-cc_library(
-  event_manager
-  SRCS event_manager.cc
-  DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
-cc_library(
-  stream_analyzer
-  SRCS stream_analyzer.cc
-  DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
+    nan_inf_utils
+    enforce
+    scope
+    glog
+    enforce
+    glog
+    scope
+    workqueue
+    interpretercore_event_garbage_collector
+    ${DEVICE_EVENT_LIBS}
+    glog)
 if(WITH_GPU OR WITH_ROCM)
-  cc_library(
-    interpretercore
-    SRCS interpretercore.cc
-    DEPS workqueue
-         ${DEVICE_EVENT_LIBS}
-         interpretercore_util
-         interpretercore_event_garbage_collector
-         interpretercore_fast_garbage_collector
-         stream_analyzer
-         event_manager)
-else()
-  cc_library(
-    interpretercore
-    SRCS interpretercore.cc
-    DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util
-         interpretercore_event_garbage_collector stream_analyzer event_manager)
+  set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS}
+      interpretercore_fast_garbage_collector)
 endif()
 cc_library(
   standalone_executor
-  SRCS standalone_executor.cc
-  DEPS interpretercore)
+  SRCS ${STANDALONE_EXECUTOR_SRCS}
+  DEPS ${STANDALONE_EXECUTOR_DEPS})
 cc_library(
   staticgraph_executor_statistics
   SRCS executor_statistics.cc
   DEPS enforce glog os_info)
-# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 # skip win32 since wget is not installed by default on windows machine.
 if(WITH_GPU
    AND WITH_TESTING
@@ -120,13 +103,7 @@ if(WITH_GPU
   cc_test(
     standalone_executor_test
     SRCS standalone_executor_test.cc
-    DEPS interpretercore
-         standalone_executor
-         operator
-         op_registry
-         executor
-         ${OPS}
-         ${OP_DEPS})
+    DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS})
   set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100)
   add_dependencies(standalone_executor_test download_program)
...
@@ -5,7 +5,7 @@ cc_library(
 cc_library(
   var_helper
   SRCS var_helper.cc
-  DEPS tensor phi_api)
+  DEPS tensor selected_rows)
 if(WITH_XPU)
   cc_library(
     prepared_operator
@@ -20,8 +20,8 @@ if(WITH_XPU)
          op_kernel_type
          data_transform
          nan_inf_utils
-         phi_api
-         phi_utils
+         scalar
+         int_array
          var_helper
          profiler)
 else()
@@ -37,21 +37,16 @@ else()
          op_kernel_type
          data_transform
          nan_inf_utils
-         phi_api
-         phi_utils
+         scalar
+         int_array
          var_helper
          profiler)
 endif()
 cc_library(
   layer
   SRCS layer.cc
-  DEPS prepared_operator
-       math_function
-       imperative_flag
-       variable_helper
-       op_registry
-       var_helper
-       phi_api)
+  DEPS prepared_operator math_function imperative_flag variable_helper
+       op_registry var_helper)
 add_subdirectory(jit)
 if(WITH_GPU)
   cc_library(
...
@@ -101,7 +101,7 @@ else()
   cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor)
 endif()
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
...
@@ -10,4 +10,4 @@ nv_library(
 nv_test(
   cudnn_helper_test
   SRCS cudnn_helper_test.cc
-  DEPS dynload_cuda phi)
+  DEPS dynload_cuda)
 pybind.h
-op_function.cc
+op_function1.cc
+op_function2.cc
+op_function3.cc
+op_function4.cc
+op_function5.cc
+op_function6.cc
+op_function7.cc
+op_function8.cc
 eager_op_function.cc
 eager_final_state_op_function.cc
@@ -102,13 +102,16 @@ endif()
 set(PYBIND_SRCS
     pybind.cc
     imperative.cc
-    op_function.cc
     inference_api.cc
     ir.cc
     bind_fleet_executor.cc
     reader_py.cc
     protobuf.cc
     exception.cc
+    op_function_common.cc
+    parallel_executor.cc
+    tensor.cc
+    place.cc
     const_value.cc
     global_value_getter_setter.cc
     fleet_wrapper_py.cc
@@ -124,13 +127,15 @@ set(PYBIND_SRCS
     generator_py.cc
     communication.cc
     cuda_streams_py.cc
-    jit.cc)
-
-execute_process(
-  COMMAND
-    "${PYTHON_EXECUTABLE}"
-    "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py"
-    "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/")
+    jit.cc
+    op_function1.cc
+    op_function2.cc
+    op_function3.cc
+    op_function4.cc
+    op_function5.cc
+    op_function6.cc
+    op_function7.cc
+    op_function8.cc)
 if(WITH_CUSTOM_DEVICE)
   set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi)
@@ -267,12 +272,35 @@ if(WITH_PYTHON)
       target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB})
     endif()
-    set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function.cc)
-    set(tmp_impl_file ${impl_file}.tmp)
+    set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/)
+    set(impl_file1 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function1.cc)
+    set(tmp_impl_file1 ${impl_file1}.tmp)
+    set(impl_file2 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function2.cc)
+    set(tmp_impl_file2 ${impl_file2}.tmp)
+    set(impl_file3 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function3.cc)
+    set(tmp_impl_file3 ${impl_file3}.tmp)
+    set(impl_file4 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function4.cc)
+    set(tmp_impl_file4 ${impl_file4}.tmp)
+    set(impl_file5 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function5.cc)
+    set(tmp_impl_file5 ${impl_file5}.tmp)
+    set(impl_file6 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function6.cc)
+    set(tmp_impl_file6 ${impl_file6}.tmp)
+    set(impl_file7 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function7.cc)
+    set(tmp_impl_file7 ${impl_file7}.tmp)
+    set(impl_file8 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function8.cc)
+    set(tmp_impl_file8 ${impl_file8}.tmp)
+    set(CODE_GEN_SPLIT_FILE_COUNT "8")
     set(eager_impl_file
         ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc)
     set(tmp_eager_impl_file ${eager_impl_file}.tmp)
+    execute_process(
+      COMMAND
+        "${PYTHON_EXECUTABLE}"
+        "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py"
+        "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/"
+        "${CODE_GEN_SPLIT_FILE_COUNT}")
     set(OP_IMPL_DEPS op_function_generator)
     set(EAGER_OP_IMPL_DEPS eager_op_function_generator
                            eager_final_state_python_c_codegen)
@@ -292,7 +320,7 @@ if(WITH_PYTHON)
         ":retry\n"
         "ECHO op_function_generator run %build_times% time\n"
         "taskkill /f /im op_function_generator.exe 2>NUL\n"
-        "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n"
+        "${op_impl_path}/op_function_generator.exe ${op_function_output_path} ${CODE_GEN_SPLIT_FILE_COUNT}\n"
         "if %ERRORLEVEL% NEQ 0 (\n"
         "  set /a build_times=%build_times%+1\n"
         "  if %build_times% GEQ 10 (\n"
@@ -367,12 +395,33 @@ if(WITH_PYTHON)
     endif()
     add_custom_command(
-      OUTPUT ${impl_file}
+      OUTPUT op_function
       COMMAND
         ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
-              ${impl_file}
-      COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1}
+              ${impl_file1}
+      COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2}
+              ${impl_file2}
+      COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3}
+              ${impl_file3}
+      COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4}
+              ${impl_file4}
+      COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5}
+              ${impl_file5}
+      COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6}
+              ${impl_file6}
+      COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7}
+              ${impl_file7}
+      COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8}
+              ${impl_file8}
+      COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}"
       DEPENDS ${OP_IMPL_DEPS})
     if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
       add_custom_command(
@@ -431,13 +480,35 @@ if(WITH_PYTHON)
       list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
     endif()
     add_custom_command(
-      OUTPUT ${impl_file}
+      OUTPUT op_function
       COMMAND
         ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
-        "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
-              ${impl_file}
-      COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
+        "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator"
+        "${op_function_output_path}" "${CODE_GEN_SPLIT_FILE_COUNT}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1}
+              ${impl_file1}
+      COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2}
+              ${impl_file2}
+      COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3}
+              ${impl_file3}
+      COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4}
+              ${impl_file4}
+      COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5}
+              ${impl_file5}
+      COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6}
+              ${impl_file6}
+      COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7}
+              ${impl_file7}
+      COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8}
+              ${impl_file8}
+      COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}"
       DEPENDS ${OP_IMPL_DEPS}
       VERBATIM)
     if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
@@ -454,19 +525,13 @@ if(WITH_PYTHON)
         VERBATIM)
     endif()
   endif()
-  add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file})
+  add_custom_target(op_function_generator_cmd ALL DEPENDS op_function)
   if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     add_custom_target(eager_op_function_generator_cmd ALL
                       DEPENDS ${eager_impl_file})
   endif()
-  list(APPEND PYBIND_DEPS interpretercore standalone_executor
-       staticgraph_executor_statistics)
-  cc_library(
-    op_function_common
-    SRCS op_function_common.cc
-    DEPS ${PYBIND_DEPS})
-  list(APPEND PYBIND_DEPS op_function_common)
+  list(APPEND PYBIND_DEPS standalone_executor staticgraph_executor_statistics)
   if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     set(PYBIND_SRCS eager.cc ${PYBIND_SRCS})
@@ -482,7 +547,6 @@ if(WITH_PYTHON)
     list(APPEND PYBIND_DEPS backward)
     list(APPEND PYBIND_DEPS grad_node_info)
     list(APPEND PYBIND_DEPS phi)
-    list(APPEND PYBIND_DEPS op_function_common)
     list(APPEND PYBIND_DEPS final_dygraph_function)
     list(APPEND PYBIND_DEPS final_dygraph_node)
     list(APPEND PYBIND_DEPS dygraph_function)
...
@@ -16,12 +16,16 @@ import sys
 import os
 if __name__ == "__main__":
-    assert len(sys.argv) == 2
+    assert len(sys.argv) == 3
     pybind_dir = sys.argv[1]
+    split_count = int(sys.argv[2])
     empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")]
     empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc"))
-    empty_files.append(os.path.join(pybind_dir, "op_function.cc"))
+
+    for i in range(split_count):
+        empty_files.append(
+            os.path.join(pybind_dir, "op_function" + str(i + 1) + ".cc"))
     for path in empty_files:
         if not os.path.exists(path):
...
@@ -64,6 +64,7 @@ limitations under the License. */
 namespace paddle {
 namespace pybind {
+std::atomic<int> VarBaseUniqueNameID{0};
 PyTypeObject *g_varbase_pytype = nullptr;
 namespace py = ::pybind11;
@@ -497,7 +498,14 @@ static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src,  // NOLINT
 void BindImperative(py::module *m_ptr) {
   auto &m = *m_ptr;
-  BindOpFunctions(&m);
+  BindOpFunctions1(&m);
+  BindOpFunctions2(&m);
+  BindOpFunctions3(&m);
+  BindOpFunctions4(&m);
+  BindOpFunctions5(&m);
+  BindOpFunctions6(&m);
+  BindOpFunctions7(&m);
+  BindOpFunctions8(&m);
 #ifndef _WIN32
   // Dygraph DataLoader signal handler
...
@@ -257,7 +257,14 @@ PyObject* MakeReturnPyObject(const std::tuple<Args...>& out) {
   return result;
 }
-void BindOpFunctions(pybind11::module* module);
+void BindOpFunctions1(pybind11::module* module);
+void BindOpFunctions2(pybind11::module* module);
+void BindOpFunctions3(pybind11::module* module);
+void BindOpFunctions4(pybind11::module* module);
+void BindOpFunctions5(pybind11::module* module);
+void BindOpFunctions6(pybind11::module* module);
+void BindOpFunctions7(pybind11::module* module);
+void BindOpFunctions8(pybind11::module* module);
 }  // namespace pybind
 }  // namespace paddle
@@ -422,13 +422,17 @@ std::string GenerateOpFunctionsBody(
   return op_function_str;
 }
-static std::tuple<std::vector<std::string>, std::vector<std::string>>
-GenerateOpFunctions() {
+static std::vector<
+    std::tuple<std::vector<std::string>, std::vector<std::string>>>
+GenerateOpFunctions(int split_count) {
   auto& op_info_map = paddle::framework::OpInfoMap::Instance().map();
+  std::vector<std::tuple<std::vector<std::string>, std::vector<std::string>>>
+      result;
   std::vector<std::string> op_function_list, bind_function_list;
   auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
+  paddle::flat_hash_map<std::string, paddle::framework::OpInfo>
+      op_info_map_need_gen;
   for (auto& pair : op_info_map) {
     auto& op_info = pair.second;
     auto op_proto = op_info.proto_;
@@ -444,6 +448,22 @@ GenerateOpFunctions() {
       continue;
     }
+    op_info_map_need_gen.emplace(pair);
+  }
+
+  int cc_file_api_size = op_info_map_need_gen.size() / split_count;
+  if (op_info_map_need_gen.size() % split_count != 0) {
+    cc_file_api_size++;
+  }
+  int api_index = 0;
+  int file_index = 0;
+
+  for (auto& pair : op_info_map_need_gen) {
+    auto& op_info = pair.second;
+    auto op_proto = op_info.proto_;
+    auto& op_type = op_proto->type();
+
     // NOTE(pangyoki): Inplace Strategy.
     // In this case, output will reuse input varbase.
     // Dygraph mode needs to be aligned with the in-place strategy in static
@@ -489,13 +509,24 @@ GenerateOpFunctions() {
       op_function_list.emplace_back(std::move(inplace_op_function_str));
       bind_function_list.emplace_back(std::move(inplace_bind_function_str));
     }
+
+    api_index++;
+    if (api_index / cc_file_api_size > file_index) {
+      file_index++;
+      result.push_back(std::make_tuple(op_function_list, bind_function_list));
+      op_function_list.clear();
+      bind_function_list.clear();
+    }
   }
-  return std::make_tuple(op_function_list, bind_function_list);
+
+  result.push_back(std::make_tuple(op_function_list, bind_function_list));
+
+  return result;
 }
 int main(int argc, char* argv[]) {
-  if (argc != 2) {
-    std::cerr << "argc must be 2" << std::endl;
+  if (argc != 3) {
+    std::cerr << "argc must be 3" << std::endl;
     return -1;
   }
@@ -513,39 +544,45 @@ int main(int argc, char* argv[]) {
       "\"paddle/fluid/pybind/op_function.h\"",
       "<Python.h>"};
-  std::ofstream out(argv[1], std::ios::out);
-
-  for (auto& header : headers) {
-    out << "#include " + header + "\n";
-  }
-
-  out << "\n\n";
-
-  auto op_funcs = GenerateOpFunctions();
-
-  out << "namespace paddle {\n"
-      << "namespace pybind {\n\n";
-  out << "std::atomic<int> VarBaseUniqueNameID{0};\n";
-  out << paddle::string::join_strings(std::get<0>(op_funcs), '\n');
-  out << "\n\n";
-
-  out << "static PyMethodDef ExtestMethods[] = {\n"
-      << paddle::string::join_strings(std::get<1>(op_funcs), '\n')
-      << "\n  {nullptr,nullptr,0,nullptr}"
-      << "};\n\n";
-
-  out << "void BindOpFunctions(pybind11::module *module) {\n"
-      << "  auto m = module->def_submodule(\"ops\");\n"
-      << "  if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n"
-      << "    PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
-         "core.ops failed!\"));\n"
-      << "  }\n\n"
-      << "  InitOpsAttrTypeMap();"
-      << "}\n\n"
-      << "} // namespace pybind\n"
-      << "} // namespace paddle\n";
-
-  out.close();
+  std::string path = argv[1];
+  int split_count = atoi(argv[2]);
+
+  auto op_funcs = GenerateOpFunctions(split_count);
+
+  for (size_t i = 0; i < op_funcs.size(); i++) {
+    std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp",
+                      std::ios::out);
+
+    for (auto& header : headers) {
+      out << "#include " + header + "\n";
+    }
+
+    out << "\n\n";
+
+    out << "namespace paddle {\n"
+        << "namespace pybind {\n\n";
+    out << "extern std::atomic<int> VarBaseUniqueNameID;\n";
+    out << paddle::string::join_strings(std::get<0>(op_funcs[i]), '\n');
+    out << "\n\n";
+
+    out << "static PyMethodDef ExtestMethods[] = {\n"
+        << paddle::string::join_strings(std::get<1>(op_funcs[i]), '\n')
+        << "\n  {nullptr,nullptr,0,nullptr}"
+        << "};\n\n";
+
+    out << "void BindOpFunctions" << i + 1 << "(pybind11::module *module) {\n"
+        << "  auto m = module->def_submodule(\"ops\");\n"
+        << "  if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n"
+        << "    PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
+           "core.ops failed!\"));\n"
+        << "  }\n\n"
+        << "  InitOpsAttrTypeMap();"
+        << "}\n\n"
+        << "} // namespace pybind\n"
+        << "} // namespace paddle\n";
+
+    out.close();
+  }
 #ifdef PADDLE_WITH_ASCEND_CL
   ge::GEFinalize();
...
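The bucketing in GenerateOpFunctions() above works as follows: ops that need a generated binding are collected first, the per-file quota is computed by ceiling division, and a new (op_function_list, bind_function_list) tuple is emitted each time the running index crosses a quota boundary; each emitted bucket is then written out as one op_function<i>.cc.tmp with its own BindOpFunctions<i> entry point. A Python sketch of the same control flow (illustrative only; generate_op_functions is not code from the patch):

    def generate_op_functions(ops, split_count):
        # ceiling division, as cc_file_api_size in the C++ generator
        quota = len(ops) // split_count + (1 if len(ops) % split_count else 0)
        result, bucket = [], []
        file_index = 0
        for api_index, op in enumerate(ops, start=1):
            bucket.append(op)                    # stands in for the generated code
            if api_index // quota > file_index:  # crossed a file boundary
                file_index += 1
                result.append(bucket)
                bucket = []
        result.append(bucket)                    # remainder goes into the last file
        return result

    # In this sketch, 100 ops split 8 ways gives a quota of 13:
    # seven buckets of 13 and a final bucket of 9.
    assert [len(b) for b in generate_op_functions(list(range(100)), 8)] == [13] * 7 + [9]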
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/parallel_executor.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
using namespace paddle::framework; // NOLINT
void BindParallelExecutor(pybind11::module &m) { // NOLINT
// -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property.
Returns:
ExecutionStrategy: An ExecutionStrategy object.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
import paddle.nn.functional as F
paddle.enable_static()
x = static.data(name='x', shape=[None, 13], dtype='float32')
y = static.data(name='y', shape=[None, 1], dtype='float32')
y_predict = static.nn.fc(input=x, size=1, act=None)
cost = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(cost)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
train_exe = static.ParallelExecutor(use_cuda=False,
loss_name=avg_loss.name,
exec_strategy=exec_strategy)
)DOC");
py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
.value("CPU", paddle::platform::DeviceType::CPU)
.value("CUDA", paddle::platform::DeviceType::CUDA)
.value("XPU", paddle::platform::DeviceType::XPU);
exec_strategy.def(py::init())
.def_property(
"num_threads",
[](const ExecutionStrategy &self) { return self.num_threads_; },
[](ExecutionStrategy &self, size_t num_threads) {
self.num_threads_ = num_threads;
},
R"DOC(
The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order maybe difference between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
)DOC")
.def_property(
"_use_device",
[](const ExecutionStrategy &self) { return self.use_device_; },
[](ExecutionStrategy &self, paddle::platform::DeviceType use_device) {
self.use_device_ = use_device;
}) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because
// use_device isn‘t exposed to users.
.def_property(
"allow_op_delay",
[](const ExecutionStrategy &self) { return self.allow_op_delay_; },
[](ExecutionStrategy &self, bool allow_op_delay) {
self.allow_op_delay_ = allow_op_delay;
},
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that this option is invalid now, and it will be removed in
next version. Default False.)DOC")
.def_property(
"num_iteration_per_drop_scope",
[](const ExecutionStrategy &self) {
return self.num_iteration_per_drop_scope_;
},
[](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
},
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations.
Default 100.
.. note::
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 10
)DOC")
.def_property(
"num_iteration_per_run",
[](const ExecutionStrategy &self) {
return self.num_iteration_per_run_;
},
[](ExecutionStrategy &self, size_t num_iteration_per_run) {
self.num_iteration_per_run_ = num_iteration_per_run;
},
R"DOC(This config that how many iteration the executor will run when
user call exe.run() in python。Default: 1.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_run = 10
)DOC")
.def_property(
"use_thread_barrier",
[](const ExecutionStrategy &self) { return self.thread_barrier_; },
[](ExecutionStrategy &self, bool use_thread_barrier) {
self.thread_barrier_ = use_thread_barrier;
},
R"DOC(This config that the this is distributed training with parameter server
)DOC")
.def_property(
"_dry_run",
[](const ExecutionStrategy &self) { return self.dry_run_; },
[](ExecutionStrategy &self, bool dry_run) {
self.dry_run_ = dry_run;
});
exec_strategy.def_property(
"use_experimental_executor",
[](const ExecutionStrategy &self) {
return self.type_ == ExecutionStrategy::kExperimental;
},
[](ExecutionStrategy &self, bool experimental) {
self.type_ = experimental ? ExecutionStrategy::kExperimental
: ExecutionStrategy::kDefault;
});
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property.
Returns:
BuildStrategy: An BuildStrategy object.
Examples:
.. code-block:: python
import os
import paddle
import paddle.static as static
paddle.enable_static()
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
data = static.data(name="x", shape=[None, 1], dtype="float32")
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
build_strategy = static.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
program = static.CompiledProgram(static.default_main_program())
program = program.with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places)
)DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
.value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
.value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce)
.value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce);
py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
"GradientScaleStrategy")
.value("CoeffNumDevice",
BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
.value("One", BuildStrategy::GradientScaleStrategy::kOne)
.value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
build_strategy.def(py::init())
.def("_clear_finalized", &BuildStrategy::ClearFinalized)
.def_property(
"reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.reduce_ = strategy;
},
R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce
strategies in ParallelExecutor, AllReduce and Reduce. If you want
that all the parameters' optimization are done on all devices independently,
you should choose AllReduce; otherwise, if you choose Reduce, all the parameters'
optimization will be evenly distributed to different devices, and then
broadcast the optimized parameter to other devices.
Default is 'AllReduce'.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
)DOC")
.def_property(
"gradient_scale_strategy",
[](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.gradient_scale_ = strategy;
},
R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three
ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice,
One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
according to the number of devices. If you want to customize :math:`loss@grad`,
you can choose Customized. Default is 'CoeffNumDevice'.
Examples:
.. code-block:: python
import numpy
import os
import paddle
import paddle.static as static
paddle.enable_static()
use_cuda = True
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
exe = static.Executor(place)
# NOTE: If you use CPU to run the program, you need
# to specify the CPU_NUM, otherwise, paddle will use
# all the number of the logic core as the CPU_NUM,
# in that case, the batch size of the input should be
# greater than CPU_NUM, if not, the process will be
# failed by an exception.
if not use_cuda:
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
else:
places = static.cuda_places()
data = static.data(name='X', shape=[None, 1], dtype='float32')
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
exe.run(static.default_startup_program())
build_strategy = static.BuildStrategy()
build_strategy.gradient_scale_strategy = \
static.BuildStrategy.GradientScaleStrategy.Customized
compiled_prog = static.CompiledProgram(
static.default_main_program()).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy,
places=places)
dev_count = len(places)
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
loss_grad_name = loss.name+"@GRAD"
loss_data = exe.run(compiled_prog,
feed={"X": x, loss_grad_name : loss_grad},
fetch_list=[loss.name, loss_grad_name])
)DOC")
.def_property(
"debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.debug_graphviz_path_ = path;
},
R"DOC((str, optional): debug_graphviz_path indicates the path that
writing the SSA Graph to file in the form of graphviz.
It is useful for debugging. Default is empty string, that is, ""
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.debug_graphviz_path = "./graph"
)DOC")
.def_property(
"enable_sequential_execution",
[](const BuildStrategy &self) {
return self.enable_sequential_execution_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_sequential_execution_ = b;
},
R"DOC((bool, optional): If set True, the execution order of ops would
be the same as what is in the program. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_sequential_execution = True
)DOC")
.def_property(
"remove_unnecessary_lock",
[](const BuildStrategy &self) {
return self.remove_unnecessary_lock_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.remove_unnecessary_lock_ = b;
},
R"DOC((bool, optional): If set True, some locks in GPU ops would be
released and ParallelExecutor would run faster. Default is True.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
)DOC")
.def_property(
"num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; },
[](BuildStrategy &self, int num_trainers) {
#ifdef WIN32
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
#endif
self.num_trainers_ = num_trainers;
})
.def_property(
"trainers_endpoints",
[](const BuildStrategy &self) { return self.trainers_endpoints_; },
[](BuildStrategy &self,
const std::vector<std::string> &trainers_endpoints) {
self.trainers_endpoints_ = trainers_endpoints;
})
.def_property(
"trainer_id",
[](const BuildStrategy &self) { return self.trainer_id_; },
[](BuildStrategy &self, int trainer_id) {
self.trainer_id_ = trainer_id;
})
.def_property(
"nccl_comm_num",
[](const BuildStrategy &self) { return self.nccl_comm_num_; },
[](BuildStrategy &self, int nccl_comm_num) {
self.nccl_comm_num_ = nccl_comm_num;
})
.def_property(
"bkcl_comm_num",
[](const BuildStrategy &self) { return self.bkcl_comm_num_; },
[](BuildStrategy &self, int bkcl_comm_num) {
self.bkcl_comm_num_ = bkcl_comm_num;
})
.def_property(
"use_hierarchical_allreduce",
[](const BuildStrategy &self) {
return self.use_hierarchical_allreduce_;
},
[](BuildStrategy &self, bool use) {
self.use_hierarchical_allreduce_ = use;
})
.def_property(
"hierarchical_allreduce_inter_nranks",
[](const BuildStrategy &self) {
return self.hierarchical_allreduce_inter_nranks_;
},
[](BuildStrategy &self, int nranks) {
self.hierarchical_allreduce_inter_nranks_ = nranks;
})
.def_property(
"fuse_elewise_add_act_ops",
[](const BuildStrategy &self) {
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_elewise_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC")
.def_property(
"fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_gemm_epilogue_ = b;
},
R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
to fuse matmul_op, elemenewist_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_gemm_epilogue = True
)DOC")
.def_property(
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC")
.def_property(
"fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether
to fuse batch_norm, elementwise_add and activation_op,
it may make the execution faster. Default is True
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_add_act_ops = True
)DOC")
.def_property(
"enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_auto_fusion_ = b;
},
R"DOC((bool, optional): Whether to enable fusing subgraph to a
fusion_group. Now we only support fusing subgraph that composed
of elementwise-like operators, such as elementwise_add/mul
without broadcast and activations.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_auto_fusion = True
)DOC")
.def_property(
"fuse_relu_depthwise_conv",
[](const BuildStrategy &self) {
return self.fuse_relu_depthwise_conv_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_relu_depthwise_conv_ = b;
},
R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether
to fuse relu and depthwise_conv2d,
it will save GPU memory and may make the execution faster.
This options is only available in GPU devices.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_relu_depthwise_conv = True
)DOC")
.def_property(
"fuse_broadcast_ops",
[](const BuildStrategy &self) {
return self.fuse_broadcast_ops_ == true ||
self.fuse_broadcast_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_broadcast_ops_ = b;
},
R"DOC((bool, optional): fuse_broadcast_op indicates whether
to fuse the broadcast ops. Note that, in Reduce mode,
fusing broadcast ops may make the program faster. Because
fusing broadcast OP equals delaying the execution of all
broadcast Ops, in this case, all nccl streams are used only
for NCCLReduce operations for a period of time. Default False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
)DOC")
.def_property(
"fuse_all_optimizer_ops",
[](const BuildStrategy &self) {
return self.fuse_all_optimizer_ops_ == true ||
self.fuse_all_optimizer_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_all_optimizer_ops_ = b;
})
.def_property(
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.sync_batch_norm_ = b;
},
R"DOC((bool, optional): sync_batch_norm indicates whether to use
synchronous batch normalization which synchronizes the mean
and variance through multi-devices in training phase.
Current implementation doesn't support FP16 training and CPU.
And only synchronous on one machine, not all machines.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True
)DOC")
.def_property(
"memory_optimize",
[](const BuildStrategy &self) -> py::object {
if (self.memory_optimize_) {
return py::cast(self.memory_optimize_.get());
} else {
return py::cast(nullptr);
}
},
[](BuildStrategy &self, const py::handle &value) {
auto *py_obj = value.ptr();
if (py_obj == nullptr || py_obj == Py_None) {
self.memory_optimize_ = paddle::none;
} else if (PyBool_Check(py_obj)) {
self.memory_optimize_ = (py_obj == Py_True);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"BuildStrategy.memory_optimize must be set to None, False "
"or True"));
}
},
R"DOC((bool, optional): memory opitimize aims to save total memory
consumption, set to True to enable it.
Default None. None means framework would choose to use or not use
this strategy automatically. Currently, None means that it is
enabled when GC is disabled, and disabled when GC is enabled.
True means enabling and False means disabling. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.memory_optimize = True
)DOC")
.def_property(
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) {
#ifdef WIN32
if (b) {
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
}
#else
self.is_distribution_ = b;
#endif
})
.def_property(
"async_mode",
[](const BuildStrategy &self) { return self.async_mode_; },
[](BuildStrategy &self, bool b) { self.async_mode_ = b; })
.def_property(
"enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property(
"fuse_all_reduce_ops",
[](const BuildStrategy &self) {
return self.fuse_all_reduce_ops_ == true ||
self.fuse_all_reduce_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
.def_property(
"enable_backward_optimizer_op_deps",
[](const BuildStrategy &self) {
return self.enable_backward_optimizer_op_deps_;
},
[](BuildStrategy &self, bool b) {
self.enable_backward_optimizer_op_deps_ = b;
})
.def_property(
"cache_runtime_context",
[](const BuildStrategy &self) { return self.cache_runtime_context_; },
[](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
.def_property(
"mkldnn_enabled_op_types",
[](const BuildStrategy &self) {
return self.mkldnn_enabled_op_types_;
},
[](BuildStrategy &self,
const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
})
.def_property(
"fix_op_run_order",
[](const BuildStrategy &self) { return self.fix_op_run_order_; },
[](BuildStrategy &self, bool fix_op_run_order) {
self.fix_op_run_order_ = fix_op_run_order;
})
.def_property(
"allow_cuda_graph_capture",
[](const BuildStrategy &self) {
return self.allow_cuda_graph_capture_;
},
[](BuildStrategy &self, bool allow_cuda_graph_capture) {
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
})
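      // _copy returns a clone with the finalized flag cleared so that the
      // copied strategy can still be reconfigured.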
.def("_copy",
[](const BuildStrategy &self) {
auto new_bs = self;
new_bs.ClearFinalized();
return new_bs;
})
.def(
"_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true);
},
R"DOC(Allow user to customized passes. Normally model-specific
optimization passes should be defined in this way. BuildStrategy
cannot be updated after being finalized.)DOC");
m.def("_set_cached_executor_build_strategy",
[](int64_t program_id, const BuildStrategy &build_strategy) {
auto &cached_exe_info = framework::ExecutorInfoCache::Instance();
cached_exe_info.SetBuildStrategy(program_id, build_strategy);
});
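  // ParallelExecutor bindings: construction, local scope management, tensor
  // feeding and the run entry points.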
pe.def(py::init<const std::vector<platform::Place> &,
const std::vector<std::string> &,
const std::string &,
Scope *,
std::vector<Scope *> &,
const ExecutionStrategy &,
const BuildStrategy &,
ir::Graph *>())
      // NOTE: even though we return a vec<Scope*>* to Python with the reference
      // policy, we still cannot get local_scope from this vector, since the
      // elements of vec<Scope*> will be freed by the Python GC. We can only
      // return Scope* one by one and mark them as references.
.def(
"local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
.def("_need_create_local_exe_scopes",
&ParallelExecutor::NeedCreateLocalExeScope)
.def("feed_tensors_into_local_scopes",
&ParallelExecutor::FeedTensorsIntoLocalScopes)
.def("feed_and_split_tensor_into_local_scopes",
&ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
.def("run",
[](ParallelExecutor &self,
const std::vector<std::string> &fetch_tensors,
bool return_merged) -> py::object {
if (return_merged) {
paddle::framework::FetchList ret;
/*gil_scoped_release*/ {
pybind11::gil_scoped_release release;
ret = self.RunAndMerge(fetch_tensors);
}
return py::cast(std::move(ret));
} else {
paddle::framework::FetchUnmergedList ret;
/*gil_scoped_release*/ {
pybind11::gil_scoped_release release;
ret = self.Run(fetch_tensors);
}
return py::cast(std::move(ret));
}
})
.def("device_count", &ParallelExecutor::DeviceCount);
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
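  // ir::Pass bindings: the overloaded set() methods hand a freshly allocated
  // attribute value over to the pass.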
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("has", &ir::Pass::Has)
.def("set_not_owned",
[](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
self.SetNotOwned<ProgramDesc>(attr_name, &attr);
})
.def(
"set",
[](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
})
.def("set",
[](ir::Pass &self, const std::string &name, bool val) {
self.Set<bool>(name, new bool(val));
})
.def("set",
[](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::unordered_set<std::string> set) {
self.Set(name, new std::unordered_set<std::string>(set));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::unordered_set<int> set) {
self.Set(name, new std::unordered_set<int>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name, VarQuantScale scales) {
self.Set(name, new VarQuantScale(scales));
})
.def("type", &ir::Pass::Type)
.def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
self.Apply(graph.get());
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder");
pb.def(py::init())
.def("append_pass",
[](ir::PassBuilder &self,
const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
return self.AppendPass(pass_type);
})
.def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
.def("insert_pass",
[](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
return self.InsertPass(idx, pass_type);
})
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
void BindParallelExecutor(pybind11::module& m); // NOLINT
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/place.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
PyTypeObject *g_place_pytype = nullptr;
PyTypeObject *g_customplace_pytype = nullptr;
PyTypeObject *g_cudaplace_pytype = nullptr;
PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_mluplace_pytype = nullptr;
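// Shared helpers for the *Place bindings: PlaceIndex exposes the underlying
// allocation type as an int, and IsSamePlace compares two places after
// normalizing both to platform::Place.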
template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) { // NOLINT
return static_cast<int>(paddle::platform::Place(p).GetType());
}
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
void BindPlace(pybind11::module &m) { // NOLINT
using namespace paddle::framework; // NOLINT
py::class_<platform::CustomPlace> customplace(m,
"CustomPlace",
R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
)DOC");
g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
customplace
.def("__init__",
[](platform::CustomPlace &self,
const std::string &device_type,
int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
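             // Validate the device type and id against the registered custom
             // devices before constructing the place via placement new.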
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), device id must be 0 "
"or "
"positive integer",
device_type,
dev_id);
std::exit(-1);
}
if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) &&
phi::DeviceManager::IsCustom(device_type))) {
int dev_count = static_cast<int>(
phi::DeviceManager::GetDeviceCount(device_type));
if (UNLIKELY(dev_id >= dev_count)) {
if (dev_count == 0) {
LOG(ERROR) << "Cannot use " << device_type
<< " because there is no " << device_type
<< " detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), dev_id must "
"inside "
"[0, %d), because %s "
"number on your machine is %d",
device_type,
dev_id,
dev_count,
device_type,
dev_count);
std::exit(-1);
}
}
new (&self) platform::CustomPlace(device_type, dev_id);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), the device type is "
"not registered "
"as a custom device.",
device_type,
dev_id);
std::exit(-1);
}
#else
LOG(ERROR) << string::Sprintf(
"Cannot use CustomDevice because you have installed CPU/GPU"
"version PaddlePaddle.\n"
"If you want to use CustomDevice, please try to install"
"CustomDevice version "
"PaddlePaddle by: pip install paddlepaddle\n"
"If you only have CPU, please change "
"CustomPlace(%s, %d) to be CPUPlace().\n",
device_type, dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::CustomPlace>)
.def("get_device_id",
[](const platform::CustomPlace &self) { return self.GetDeviceId(); })
.def("get_device_type",
[](const platform::CustomPlace &self) {
return self.GetDeviceType();
})
.def("__repr__", string::to_string<const platform::CustomPlace &>)
.def("__str__", string::to_string<const platform::CustomPlace &>);
py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device.
    It represents a GPU device on which a Tensor or LoDTensor will be allocated.
    Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace,
    starting from 0.
    The memory of CUDAPlaces with different dev_ids is not accessible to each other.
Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card.
You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable.
When the program starts, visible GPU devices will be numbered from 0.
If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default,
and the logical ID is the same as the actual ID.
Parameters:
id (int): GPU device ID.
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPlace(0)
)DOC");
g_cudaplace_pytype = reinterpret_cast<PyTypeObject *>(cudaplace.ptr());
cudaplace
.def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) {
if (platform::GetGPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use GPU because there is no GPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
"number on your machine is %d",
dev_id,
platform::GetGPUDeviceCount(),
platform::GetGPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::CUDAPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use GPU because you have installed CPU version "
"PaddlePaddle.\n"
"If you want to use GPU, please try to install GPU version "
"PaddlePaddle by: pip install paddlepaddle-gpu\n"
"If you only have CPU, please change CUDAPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
[](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::CUDAPlace &>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace> xpuplace(m, "XPUPlace", R"DOC(
**Note**:
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC");
g_xpuplace_pytype = reinterpret_cast<PyTypeObject *>(xpuplace.ptr());
xpuplace
.def("__init__",
[](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
if (platform::GetXPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use XPU because there is no XPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id,
platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::XPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use XPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use XPU, please try to install XPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change XPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#ifdef PADDLE_WITH_XPU
.def("_type", &PlaceIndex<platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::XPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::XPUPlace &>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
py::enum_<phi::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", phi::backends::xpu::XPUVersion::XPU1)
.value("XPU2", phi::backends::xpu::XPUVersion::XPU2)
.export_values();
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
m.def("get_xpu_device_version",
[](int device_id) { return platform::get_xpu_version(device_id); });
#ifdef PADDLE_WITH_XPU_KP
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_kp_op_support_type(op_name, version);
});
#else
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
#endif
m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_list(version);
});
m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
    // XPUs of version XPU2 and above support float16 and bfloat16
return platform::get_xpu_version(place.device) >
phi::backends::xpu::XPUVersion::XPU1;
});
m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
    // XPUs of version XPU2 and above support float16 and bfloat16
return platform::get_xpu_version(place.device) >
phi::backends::xpu::XPUVersion::XPU1;
});
#endif
py::class_<paddle::platform::CPUPlace> cpuplace(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
cpu_place = paddle.CPUPlace()
)DOC");
g_cpuplace_pytype = reinterpret_cast<PyTypeObject *>(cpuplace.ptr());
cpuplace.def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__repr__", string::to_string<const platform::CPUPlace &>)
.def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace> cudapinnedplace(
m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device.
    It refers to the page-locked host memory allocated by the CUDA function `cudaHostAlloc()`.
    The host operating system will not page out or swap this memory.
    It can be accessed through direct memory access (DMA) to speed up data copies between the host and the GPU.
For more information on CUDA data transfer and `pinned memory`,
please refer to `official document <https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#pinned-memory>`_ .
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPinnedPlace()
)DOC");
g_cudapinnedplace_pytype =
reinterpret_cast<PyTypeObject *>(cudapinnedplace.ptr());
cudapinnedplace
.def("__init__",
[](platform::CUDAPinnedPlace &self) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
new (&self) platform::CUDAPinnedPlace();
})
.def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
    It represents an NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
npu_place = paddle.NPUPlace(0)
)DOC");
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace
.def("__init__",
[](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
if (platform::GetNPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use NPU because there is no NPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::NPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-npu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
    It represents an IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
.def("__init__",
[](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
if (platform::GetIPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use IPU because there is no IPU "
"detected on your "
"machine.";
std::exit(-1);
}
             // use ipu(0) to compile, while running with the number of IPUs the
             // user configures in sharding and pipelining.
new (&self) platform::IPUPlace(0);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use IPU because you didn't install IPU version "
"PaddlePaddle.\n"
"If you want to use IPU, please try to install IPU version "
"PaddlePaddle by: pip install paddlepaddle*\n"
"If you only have CPU, please change IPUPlace to be "
"CPUPlace().\n");
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>);
// MLUPlace
py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
    It represents an MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
mluplace
.def("__init__",
[](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
if (platform::GetMLUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use MLU because there is no MLU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::MLUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use MLU because you have installed CPU/GPU/... "
"version "
"PaddlePaddle.\n"
"If you want to use MLU, please try to install MLU version "
"PaddlePaddle by: pip install paddlepaddle-mlu\n"
"If you only have CPU, please change MLUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::MLUPlace &>);
py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("is_cpu_place",
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_ipu_place",
[](platform::Place &self) { return platform::is_ipu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
})
.def("is_mlu_place",
[](platform::Place &self) { return platform::is_mlu_place(self); })
.def(
"is_custom_place",
[](platform::Place &self) { return platform::is_custom_place(self); })
.def("gpu_device_id", [](platform::Place &self) { return self.device; })
.def("xpu_device_id", [](platform::Place &self) { return self.device; })
.def("npu_device_id", [](platform::Place &self) { return self.device; })
.def("ipu_device_id", [](platform::Place &self) { return self.device; })
.def("mlu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
.def("set_place",
[](platform::Place &self, const platform::Place &other) {
self = other;
})
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::XPUPlace &xpu_place) {
self = xpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place;
})
.def("set_place",
[](platform::Place &self,
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
})
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
})
.def("set_place",
[](platform::Place &self, const platform::MLUPlace &mlu_place) {
self = mlu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CustomPlace &plug_place) {
self = plug_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
void BindPlace(pybind11::module& m); // NOLINT
} // namespace pybind
} // namespace paddle
...@@ -122,9 +122,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/parallel_executor.h"
#include "paddle/fluid/pybind/place.h"
#include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -194,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); ...@@ -194,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
PyTypeObject *g_place_pytype = nullptr;
PyTypeObject *g_framework_scope_pytype = nullptr;
PyTypeObject *g_cudaplace_pytype = nullptr;
PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_mluplace_pytype = nullptr;
PyTypeObject *g_customplace_pytype = nullptr;
PyTypeObject *g_framework_tensor_pytype = nullptr;
PyTypeObject *g_framework_lodtensorarray_pytype = nullptr;
PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr;
...@@ -349,16 +343,6 @@ bool IsCompiledWithDIST() {
#endif
}
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) {
return static_cast<int>(paddle::platform::Place(p).GetType());
}
static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) {
  // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name
  // is not inside obj, but it would also set the error flag of Python.
...@@ -541,19 +525,6 @@ static int GetNCCLVersion() {
}
#endif
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst,
const framework::Tensor &src,
const PlaceType &place,
int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(src, place, dst);
} else {
auto sliced = src.Slice(0, batch_size);
framework::TensorCopy(sliced, place, dst);
}
}
#ifdef PADDLE_WITH_AVX
PYBIND11_MODULE(core_avx, m) {
#else
...@@ -854,897 +825,6 @@ PYBIND11_MODULE(core_noavx, m) {
        self.EmplaceBackAttr(attr);
      });
py::class_<framework::Tensor> framework_tensor(
m, "Tensor", py::buffer_protocol());
g_framework_tensor_pytype =
reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
.def("__array__",
[](framework::Tensor &self) { return TensorToPyArray(self); })
.def("_ptr",
[](const framework::Tensor &self) {
return reinterpret_cast<uintptr_t>(self.data());
})
.def("_slice", &framework::Tensor::Slice)
.def("_numel", &framework::Tensor::numel)
.def("_is_initialized",
[](const framework::Tensor &self) { return self.IsInitialized(); })
.def("_get_dims",
[](const framework::Tensor &self) { return vectorize(self.dims()); })
.def("_set_dims",
[](framework::Tensor &self, const std::vector<int64_t> &dim) {
self.Resize(phi::make_ddim(dim));
})
.def("_set_layout",
[](framework::Tensor &self, const std::string &layout) {
self.set_layout(StringToDataLayout(layout));
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_float",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
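      // _mutable_data allocates (if necessary) on the given place with the
      // requested dtype and returns the raw pointer as an integer address.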
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC(
Set the data of Tensor on place with given numpy array.
Args:
                array (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the
Tensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
)DOC")
.def(
"shape",
[](framework::Tensor &self) { return vectorize(self.dims()); },
R"DOC(
Return the shape of Tensor.
Returns:
list[int]: The shape of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
print(t.shape()) # [5, 30]
)DOC")
.def("_to_dlpack",
[](framework::Tensor &self) {
DLPackTensor dlpack_tensor(self, 1);
DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor();
auto capsule = py::capsule(
static_cast<void *>(dmt), "dltensor", [](PyObject *ptr) {
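                   // DLPack convention: a consumer renames the capsule to
                   // "used_dltensor" and takes ownership, so only capsules
                   // still named "dltensor" are freed by this deleter.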
if (ptr) {
auto dltensor = new DLManagedTensor;
try {
dltensor = reinterpret_cast<DLManagedTensor *>(
PyCapsule_GetPointer(ptr, "used_dltensor"));
return;
} catch (...) {
dltensor = reinterpret_cast<DLManagedTensor *>(
PyCapsule_GetPointer(ptr, "dltensor"));
}
dltensor->deleter(dltensor);
}
});
return capsule;
})
.def("_set_float_element", TensorSetElement<float>)
.def("_get_float_element", TensorGetElement<float>)
.def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>)
.def("_place", [](framework::Tensor &self) { return self.place(); })
.def("_dtype",
[](framework::Tensor &self) {
return framework::TransToProtoVarType(self.type());
})
.def("_layout",
[](framework::Tensor &self) {
return DataLayoutToString(self.layout());
})
.def("_share_data_with", &framework::Tensor::ShareDataWith)
.def("__getitem__", PySliceTensor, py::return_value_policy::reference)
.def("__str__",
[](const framework::Tensor &self) {
std::stringstream ostr;
ostr << self;
return ostr.str();
}) /* ------ End of original Tensor ------ */
.def("__init__",
[](framework::Tensor &instance,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
std::copy(recursive_sequence_lengths.begin(),
recursive_sequence_lengths.end(),
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, -1),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
"the LoD converted by recursive_sequence_lengths is %s",
new_lod));
new (&instance) framework::Tensor(new_offset_lod);
})
.def("__init__",
[](framework::Tensor &instance) {
new (&instance) framework::Tensor();
})
// We implement offset based LOD in C++ while we use length based with
// Python API. So we changed set_lod to set_recursive_sequence_lengths
// to
// avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def(
"set_lod",
[](framework::Tensor &self,
const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
PADDLE_ENFORCE_EQ(
CheckLoD(new_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided LoD is invalid, the LoD is %s", new_lod));
self.set_lod(new_lod);
},
py::arg("lod"),
R"DOC(
Set LoD of the Tensor.
Args:
lod (list[list[int]]): The lod to set.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
.def(
"set_recursive_sequence_lengths",
[](framework::Tensor &self,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
// the input recursive_sequence_lengths is length-based
// level-of-detail info
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
std::copy(recursive_sequence_lengths.begin(),
recursive_sequence_lengths.end(),
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
"the LoD converted by recursive_sequence_lengths is "
"%s",
new_lod));
self.set_lod(new_offset_lod);
},
py::arg("recursive_sequence_lengths"),
R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]].
Args:
recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
print(t.lod()) # [[0, 2, 5]]
)DOC")
.def(
"lod",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the offset-based lod info
LoD lod = self.lod();
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
Return the LoD of the Tensor.
Returns:
list[list[int]]: The lod of the Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// Set above comments of set_lod.
.def(
"recursive_sequence_lengths",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info
LoD lod = phi::ConvertToLengthBasedLoD(self.lod());
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
           Return the recursive sequence lengths corresponding to the LoD
           of the Tensor.
Returns:
list[list[int]]: The recursive sequence lengths.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
)DOC")
.def(
"has_valid_recursive_sequence_lengths",
[](framework::Tensor &self) -> bool {
// Check that the lod info is valid and match the outermost
// dimension of the Tensor data
return CheckLoD(self.lod(), vectorize(self.dims()).front());
},
R"DOC(
Check whether the LoD of the Tensor is valid.
Returns:
bool: Whether the LoD is valid.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.has_valid_recursive_sequence_lengths()) # True
)DOC")
.def("_as_type",
[](const framework::Tensor &self,
paddle::framework::proto::VarType::Type type) {
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TransDataType(self, type, &dst);
}
return dst;
})
.def("_copy",
[](const framework::Tensor &self, const platform::Place &place) {
             // follow fetch_op's implementation
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TensorCopySync(self, place, &dst);
} else {
// Not copy, if the src tensor is empty.
dst.clear();
dst.Resize({0});
}
dst.set_lod(self.lod());
return dst;
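             // On Windows the binding chain is closed right below: the CUDA IPC
             // and mmap-based shared-memory methods that follow are only
             // compiled on non-Windows platforms.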
#ifdef _WIN32
});
#else
})
#ifdef PADDLE_WITH_CUDA
.def("_share_buffer_with",
[](framework::Tensor &self, const framework::Tensor src,
py::tuple t) {
auto *cuda_ipc_allocation =
dynamic_cast<memory::allocation::CudaIpcAllocation *>(
src.Holder().get());
PADDLE_ENFORCE_NOT_NULL(
cuda_ipc_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not Cuda IPC shared tensor. "
"Now only Tensor shared by cuda ipc could use this "
"api."));
size_t size = t[0].cast<size_t>();
auto dtype =
static_cast<paddle::experimental::DataType>(t[1].cast<int>());
auto dims = phi::make_ddim(t[2].cast<std::vector<int>>());
auto lod_info = t[3].cast<framework::LoD>();
auto device_id = t[4].cast<int>();
auto shared_reader_holder =
std::make_shared<memory::allocation::Allocation>(
cuda_ipc_allocation->ptr(),
cuda_ipc_allocation->base_ptr(), size,
platform::CUDAPlace(device_id));
self.ResetHolderWithType(shared_reader_holder, dtype);
self.Resize(dims);
self.set_lod(lod_info);
VLOG(6) << "Reconstructed tensor with buffer shared!";
},
R"DOC(
           Deserialize a GPU Tensor from an existing shared CUDA IPC tensor.
           Params:
               tensor: Shared CUDA IPC tensor.
               tuple: contains data size, data type,
                      tensor dims, lod information, device index.
)DOC")
.def("_share_cuda",
[](framework::Tensor self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass "
"to shared memory. ");
auto *holder = dynamic_cast<memory::allocation::Allocation *>(
self.Holder().get());
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(holder->place()), true,
platform::errors::InvalidArgument(
"Tensor is not on GPU. share_cuda only support GPU "
"Tensor, share_filename is for CPU tensor."));
void *base_ptr = holder->base_ptr();
ptrdiff_t offset_bytes = reinterpret_cast<char *>(holder->ptr()) -
reinterpret_cast<char *>(base_ptr);
cudaIpcMemHandle_t handle;
PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr));
auto _handle = py::bytes(reinterpret_cast<char *>(&handle),
(py::ssize_t)CUDA_IPC_HANDLE_SIZE);
// TODO(ZHUI): use cuda event, to avoid sync.
const auto &device_id = paddle::platform::GetCurrentDeviceId();
auto stream =
paddle::platform::stream::get_current_stream(device_id);
stream->Synchronize();
int type_idx = static_cast<int>(self.type());
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size,
type_idx, vectorize(self.dims()), self.lod(),
device_id);
},
R"DOC(
Serialize GPU Tensor by cudaIpcMemHandle.
Returns:
               tuple: contains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
)DOC")
.def("_new_shared_cuda",
[](py::tuple t) {
if (t.size() != 7)
throw std::runtime_error(
"Invalid Tensor meta info for shared cuda tensor!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation from handle
const std::string &handle = t[0].cast<std::string>();
ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast<int64_t>();
auto device_id = t[6].cast<int>();
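              // GetIpcBasePtr maps the sender's base allocation into this
              // process from the serialized CUDA IPC handle; the stored offset
              // is then added below to locate the tensor's data.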
auto base_ptr = memory::allocation::GetIpcBasePtr(handle);
size_t size = t[2].cast<size_t>();
void *dev = base_ptr.get();
dev = reinterpret_cast<char *>(dev) + offset_bytes;
auto shared_reader_holder =
std::make_shared<memory::allocation::CudaIpcAllocation>(
dev, size, device_id, std::move(base_ptr));
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[3].cast<int>()));
tensor.Resize(phi::make_ddim(t[4].cast<std::vector<int>>()));
tensor.set_lod(t[5].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize GPU lod tensor from cudaIpcMemHandle.
Params:
               tuple: contains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))
)DOC")
#endif
.def("_share_filename",
[](framework::Tensor &self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass to "
"shared memory. ");
auto holder = self.Holder();
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(holder->place()) ||
platform::is_cuda_pinned_place(holder->place()),
true, platform::errors::InvalidArgument(
"Tensor is not on CPU. share_filename only "
"support CPU Tensor."));
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
holder.get());
// If the tensor is not shared, allocate memory map allocation.
if (mmap_allocation == nullptr) {
void *data_ptr = self.data();
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName();
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size);
// copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) {
#ifdef PADDLE_WITH_CUDA
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CUDAPinnedPlace(), data_ptr, data_size);
#endif
} else {
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CPUPlace(), data_ptr, data_size);
}
self.ResetHolder(shared_holder);
mmap_allocation = shared_holder.get();
}
int type_idx = static_cast<int>(self.type());
return py::make_tuple(mmap_allocation->ipc_name(),
mmap_allocation->size(), type_idx,
vectorize(self.dims()), self.lod());
},
R"DOC(
Serialize CPU lod tensor in shared memory to tuple.
If the tensor is not in shared memory, we will copy it first.
Returns:
               tuple: contains ipc name, data size, data type,
                      tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
)DOC")
.def("_new_shared_filename",
[](py::tuple t) { // __setstate__
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor meta info state!");
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE;
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size);
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize CPU lod tensor from shared memory.
Params:
               tuple: contains ipc file name, data size, data type,
                      tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))
)DOC")
.def("_shared_incref",
[](framework::Tensor &self) {
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
self.Holder().get());
if (mmap_allocation) {
mmap_allocation->incref();
}
},
R"DOC(
Increase reference count of share_filename tensor.
)DOC")
.def("_shared_decref",
[](framework::Tensor &self) {
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
self.Holder().get());
if (mmap_allocation) {
mmap_allocation->decref();
}
},
R"DOC(
Decrease reference count of share_filename tensor.
)DOC")
.def(py::pickle(
[](const framework::Tensor &t) { // __getstate__
auto holder = t.Holder();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true,
platform::errors::PreconditionNotMet(
"Tensor is not on CPU."
"Now only Tensor on CPU can be serialized."));
auto *mmap_writer_allocation =
dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
holder.get());
PADDLE_ENFORCE_NOT_NULL(
mmap_writer_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not in shared memory."
"Now only Tensor on shared memory can be serialized."));
int type_idx = static_cast<int>(t.type());
return py::make_tuple(mmap_writer_allocation->ipc_name(),
mmap_writer_allocation->size(), type_idx,
vectorize(t.dims()), t.lod());
},
[](py::tuple t) { // __setstate__
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor state!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
auto shared_reader_holder =
memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name,
size);
// 3. Maintain global fd set
VLOG(3) << "Tensor ipc name: " << ipc_name;
memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
// 4. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
}));
#endif
py::class_<phi::SelectedRows>(m, "SelectedRows")
.def("__init__",
[](phi::SelectedRows &instance) {
new (&instance) phi::SelectedRows();
})
.def("__init__",
[](phi::SelectedRows &instance,
const std::vector<int64_t> rows,
const int64_t &height) {
new (&instance) phi::SelectedRows(rows, height);
})
.def(
"get_tensor",
[](phi::SelectedRows &self) { return self.mutable_value(); },
py::return_value_policy::reference)
.def("numel",
[](phi::SelectedRows &self) -> int64_t {
return self.value().numel();
})
.def("set_height", &phi::SelectedRows::set_height)
.def("height", &phi::SelectedRows::height)
.def("set_rows",
[](phi::SelectedRows &self, std::vector<int64_t> rows) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self.set_rows(rows);
#else
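        // On GPU builds the rows go through paddle's Vector (a mixed CPU/GPU
        // vector) before being set, so they can also be accessed from device
        // code.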
Vector<int64_t> new_rows(rows);
self.set_rows(new_rows);
#endif
})
.def("sync_index",
[](phi::SelectedRows &instance) { instance.SyncIndex(); })
.def("rows", [](phi::SelectedRows &self) {
auto rows = self.rows();
std::vector<int64_t> new_rows;
new_rows.reserve(rows.size());
std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
return new_rows;
});
py::class_<Variable>(m, "Variable", R"DOC(Variable Class. py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
All parameter, weight, gradient are variables in Paddle. All parameter, weight, gradient are variables in Paddle.
...@@ -2272,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2272,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
return devices; return devices;
}); });
py::class_<platform::CustomPlace> customplace(m,
"CustomPlace",
R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
)DOC");
g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
customplace
.def("__init__",
[](platform::CustomPlace &self,
const std::string &device_type,
int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), device id must be 0 "
"or "
"positive integer",
device_type,
dev_id);
std::exit(-1);
}
if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) &&
phi::DeviceManager::IsCustom(device_type))) {
int dev_count = static_cast<int>(
phi::DeviceManager::GetDeviceCount(device_type));
if (UNLIKELY(dev_id >= dev_count)) {
if (dev_count == 0) {
LOG(ERROR) << "Cannot use " << device_type
<< " because there is no " << device_type
<< " detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), dev_id must "
"inside "
"[0, %d), because %s "
"number on your machine is %d",
device_type,
dev_id,
dev_count,
device_type,
dev_count);
std::exit(-1);
}
}
new (&self) platform::CustomPlace(device_type, dev_id);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CustomPlace(%s, %d), the device type is "
"not registered "
"as a custom device.",
device_type,
dev_id);
std::exit(-1);
}
#else
LOG(ERROR) << string::Sprintf(
"Cannot use CustomDevice because you have installed CPU/GPU"
"version PaddlePaddle.\n"
"If you want to use CustomDevice, please try to install"
"CustomDevice version "
"PaddlePaddle by: pip install paddlepaddle\n"
"If you only have CPU, please change "
"CustomPlace(%s, %d) to be CPUPlace().\n",
device_type, dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::CustomPlace>)
.def("get_device_id",
[](const platform::CustomPlace &self) { return self.GetDeviceId(); })
.def("get_device_type",
[](const platform::CustomPlace &self) {
return self.GetDeviceType();
})
.def("__repr__", string::to_string<const platform::CustomPlace &>)
.def("__str__", string::to_string<const platform::CustomPlace &>);
py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device.
It represents a GPU device allocated or to be allocated with Tensor or LoDTensor.
Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace,
    starting from 0.
    The memory of CUDAPlaces with different dev_id values is not mutually accessible.
Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card.
You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable.
When the program starts, visible GPU devices will be numbered from 0.
If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default,
and the logical ID is the same as the actual ID.
Parameters:
id (int): GPU device ID.
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPlace(0)
)DOC");
g_cudaplace_pytype = reinterpret_cast<PyTypeObject *>(cudaplace.ptr());
cudaplace
.def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) {
if (platform::GetGPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use GPU because there is no GPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
"number on your machine is %d",
dev_id,
platform::GetGPUDeviceCount(),
platform::GetGPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::CUDAPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use GPU because you have installed CPU version "
"PaddlePaddle.\n"
"If you want to use GPU, please try to install GPU version "
"PaddlePaddle by: pip install paddlepaddle-gpu\n"
"If you only have CPU, please change CUDAPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
[](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::CUDAPlace &>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace> xpuplace(m, "XPUPlace", R"DOC(
**Note**:
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC");
g_xpuplace_pytype = reinterpret_cast<PyTypeObject *>(xpuplace.ptr());
xpuplace
.def("__init__",
[](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
if (platform::GetXPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use XPU because there is no XPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id,
platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::XPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use XPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use XPU, please try to install XPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change XPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#ifdef PADDLE_WITH_XPU
.def("_type", &PlaceIndex<platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::XPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::XPUPlace &>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
py::enum_<phi::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", phi::backends::xpu::XPUVersion::XPU1)
.value("XPU2", phi::backends::xpu::XPUVersion::XPU2)
.export_values();
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
m.def("get_xpu_device_version",
[](int device_id) { return platform::get_xpu_version(device_id); });
#ifdef PADDLE_WITH_XPU_KP
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_kp_op_support_type(op_name, version);
});
#else
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
#endif
m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_list(version);
});
m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
    // XPUs newer than XPU1 (i.e. XPU2 and later) support float16 and bfloat16
return platform::get_xpu_version(place.device) >
phi::backends::xpu::XPUVersion::XPU1;
});
m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
    // XPUs newer than XPU1 (i.e. XPU2 and later) support float16 and bfloat16
return platform::get_xpu_version(place.device) >
phi::backends::xpu::XPUVersion::XPU1;
});
#endif
py::class_<paddle::platform::CPUPlace> cpuplace(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
cpu_place = paddle.CPUPlace()
)DOC");
g_cpuplace_pytype = reinterpret_cast<PyTypeObject *>(cpuplace.ptr());
cpuplace.def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__repr__", string::to_string<const platform::CPUPlace &>)
.def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace> cudapinnedplace(
m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device.
It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory.
    The host operating system will not page out or swap this memory.
    It can be accessed through direct memory access (DMA) to speed up data copies between the host and the GPU.
For more information on CUDA data transfer and `pinned memory`,
please refer to `official document <https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#pinned-memory>`_ .
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPinnedPlace()
)DOC");
g_cudapinnedplace_pytype =
reinterpret_cast<PyTypeObject *>(cudapinnedplace.ptr());
cudapinnedplace
.def("__init__",
[](platform::CUDAPinnedPlace &self) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
new (&self) platform::CUDAPinnedPlace();
})
.def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
    It represents an NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
npu_place = paddle.NPUPlace(0)
)DOC");
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace
.def("__init__",
[](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
if (platform::GetNPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use NPU because there is no NPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::NPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-npu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
    It represents an IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
.def("__init__",
[](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
if (platform::GetIPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use IPU because there is no IPU "
"detected on your "
"machine.";
std::exit(-1);
}
             // use ipu(0) to compile, while running with the number of IPUs
             // the user configures in sharding and pipeline.
new (&self) platform::IPUPlace(0);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use IPU because you didn't install IPU version "
"PaddlePaddle.\n"
"If you want to use IPU, please try to install IPU version "
"PaddlePaddle by: pip install paddlepaddle*\n"
"If you only have CPU, please change IPUPlace to be "
"CPUPlace().\n");
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>);
// MLUPlace
py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
    It represents an MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
mluplace
.def("__init__",
[](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
if (platform::GetMLUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use MLU because there is no MLU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::MLUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use MLU because you have installed CPU/GPU/... "
"version "
"PaddlePaddle.\n"
"If you want to use MLU, please try to install MLU version "
"PaddlePaddle by: pip install paddlepaddle-mlu\n"
"If you only have CPU, please change MLUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::MLUPlace &>);
py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("is_cpu_place",
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_ipu_place",
[](platform::Place &self) { return platform::is_ipu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
})
.def("is_mlu_place",
[](platform::Place &self) { return platform::is_mlu_place(self); })
.def(
"is_custom_place",
[](platform::Place &self) { return platform::is_custom_place(self); })
.def("gpu_device_id", [](platform::Place &self) { return self.device; })
.def("xpu_device_id", [](platform::Place &self) { return self.device; })
.def("npu_device_id", [](platform::Place &self) { return self.device; })
.def("ipu_device_id", [](platform::Place &self) { return self.device; })
.def("mlu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
.def("set_place",
[](platform::Place &self, const platform::Place &other) {
self = other;
})
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::XPUPlace &xpu_place) {
self = xpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place;
})
.def("set_place",
[](platform::Place &self,
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
})
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
})
.def("set_place",
[](platform::Place &self, const platform::MLUPlace &mlu_place) {
self = mlu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CustomPlace &plug_place) {
self = plug_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
  py::class_<OperatorBase>(m, "Operator")
      .def_static("create",
@@ -3661,927 +2144,6 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("clear_executor_cache",
        []() { framework::ExecutorInfoCache::Instance().Finalize(); });
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("has", &ir::Pass::Has)
.def("set_not_owned",
[](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
self.SetNotOwned<ProgramDesc>(attr_name, &attr);
})
.def(
"set",
[](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
})
.def("set",
[](ir::Pass &self, const std::string &name, bool val) {
self.Set<bool>(name, new bool(val));
})
.def("set",
[](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::unordered_set<std::string> set) {
self.Set(name, new std::unordered_set<std::string>(set));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::unordered_set<int> set) {
self.Set(name, new std::unordered_set<int>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name, VarQuantScale scales) {
self.Set(name, new VarQuantScale(scales));
})
.def("type", &ir::Pass::Type)
.def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
self.Apply(graph.get());
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder");
pb.def(py::init())
.def("append_pass",
[](ir::PassBuilder &self,
const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
return self.AppendPass(pass_type);
})
.def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
.def("insert_pass",
[](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
return self.InsertPass(idx, pass_type);
})
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
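  // Typical Python-side flow (a sketch, not prescribed by this file):
  // BuildStrategy._finalize_strategy_and_create_passes() returns a PassBuilder
  // whose pass list can then be inspected or adjusted through append_pass /
  // insert_pass / remove_pass before compilation.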
// -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
    ExecutionStrategy allows the user to more precisely control how to run
the program in ParallelExecutor by setting the property.
Returns:
ExecutionStrategy: An ExecutionStrategy object.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
import paddle.nn.functional as F
paddle.enable_static()
x = static.data(name='x', shape=[None, 13], dtype='float32')
y = static.data(name='y', shape=[None, 1], dtype='float32')
y_predict = static.nn.fc(input=x, size=1, act=None)
cost = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(cost)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
train_exe = static.ParallelExecutor(use_cuda=False,
loss_name=avg_loss.name,
exec_strategy=exec_strategy)
)DOC");
py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
.value("CPU", paddle::platform::DeviceType::CPU)
.value("CUDA", paddle::platform::DeviceType::CUDA)
.value("XPU", paddle::platform::DeviceType::XPU);
exec_strategy.def(py::init())
.def_property(
"num_threads",
[](const ExecutionStrategy &self) { return self.num_threads_; },
[](ExecutionStrategy &self, size_t num_threads) {
self.num_threads_ = num_threads;
},
R"DOC(
                The type is INT, num_threads represents the size of the thread pool
                used to run the operators of the current program in ParallelExecutor.
                If :math:`num\_threads=1`, all the operators will execute one by one,
                but the order may differ between iterations.
                If it is not set, it will be set in ParallelExecutor according to the
                device type and device count: for GPU, :math:`num\_threads=device\_count*4`; for CPU,
                :math:`num\_threads=CPU\_NUM*4`. The explanation of :math:`CPU\_NUM` is in ParallelExecutor;
                if it is not set, ParallelExecutor will get the CPU count by calling
                `multiprocessing.cpu_count()`. Default 0.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
)DOC")
.def_property(
"_use_device",
[](const ExecutionStrategy &self) { return self.use_device_; },
[](ExecutionStrategy &self, paddle::platform::DeviceType use_device) {
self.use_device_ = use_device;
          })  // NOTE(liuyuhui): No doc is added for 'use_device', because
              // use_device isn't exposed to users.
.def_property(
"allow_op_delay",
[](const ExecutionStrategy &self) { return self.allow_op_delay_; },
[](ExecutionStrategy &self, bool allow_op_delay) {
self.allow_op_delay_ = allow_op_delay;
},
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that this option is invalid now, and it will be removed in
next version. Default False.)DOC")
.def_property(
"num_iteration_per_drop_scope",
[](const ExecutionStrategy &self) {
return self.num_iteration_per_drop_scope_;
},
[](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
},
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations.
Default 100.
.. note::
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 10
)DOC")
.def_property(
"num_iteration_per_run",
[](const ExecutionStrategy &self) {
return self.num_iteration_per_run_;
},
[](ExecutionStrategy &self, size_t num_iteration_per_run) {
self.num_iteration_per_run_ = num_iteration_per_run;
},
R"DOC(This config that how many iteration the executor will run when
user call exe.run() in python。Default: 1.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_run = 10
)DOC")
.def_property(
"use_thread_barrier",
[](const ExecutionStrategy &self) { return self.thread_barrier_; },
[](ExecutionStrategy &self, bool use_thread_barrier) {
self.thread_barrier_ = use_thread_barrier;
},
R"DOC(This config that the this is distributed training with parameter server
)DOC")
.def_property(
"_dry_run",
[](const ExecutionStrategy &self) { return self.dry_run_; },
[](ExecutionStrategy &self, bool dry_run) {
self.dry_run_ = dry_run;
});
exec_strategy.def_property(
"use_experimental_executor",
[](const ExecutionStrategy &self) {
return self.type_ == ExecutionStrategy::kExperimental;
},
[](ExecutionStrategy &self, bool experimental) {
self.type_ = experimental ? ExecutionStrategy::kExperimental
: ExecutionStrategy::kDefault;
});
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
    BuildStrategy allows the user to more precisely control how to
build the SSA Graph in ParallelExecutor by setting the property.
Returns:
BuildStrategy: An BuildStrategy object.
Examples:
.. code-block:: python
import os
import paddle
import paddle.static as static
paddle.enable_static()
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
data = static.data(name="x", shape=[None, 1], dtype="float32")
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
build_strategy = static.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
program = static.CompiledProgram(static.default_main_program())
program = program.with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places)
)DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
.value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
.value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce)
.value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce);
py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
"GradientScaleStrategy")
.value("CoeffNumDevice",
BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
.value("One", BuildStrategy::GradientScaleStrategy::kOne)
.value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
build_strategy.def(py::init())
.def("_clear_finalized", &BuildStrategy::ClearFinalized)
.def_property(
"reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.reduce_ = strategy;
},
R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce
                strategies in ParallelExecutor, AllReduce and Reduce. If you want
                all the parameters' optimization to be done on all devices independently,
                you should choose AllReduce; otherwise, if you choose Reduce, the parameters'
                optimization will be evenly distributed across devices, and the optimized
                parameters will then be broadcast to the other devices.
Default is 'AllReduce'.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
)DOC")
.def_property(
"gradient_scale_strategy",
[](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.gradient_scale_ = strategy;
},
R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three
ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice,
One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
according to the number of devices. If you want to customize :math:`loss@grad`,
you can choose Customized. Default is 'CoeffNumDevice'.
Examples:
.. code-block:: python
import numpy
import os
import paddle
import paddle.static as static
paddle.enable_static()
use_cuda = True
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
exe = static.Executor(place)
# NOTE: If you use CPU to run the program, you need
# to specify the CPU_NUM, otherwise, paddle will use
# all the number of the logic core as the CPU_NUM,
# in that case, the batch size of the input should be
# greater than CPU_NUM, if not, the process will be
# failed by an exception.
if not use_cuda:
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
else:
places = static.cuda_places()
data = static.data(name='X', shape=[None, 1], dtype='float32')
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
exe.run(static.default_startup_program())
build_strategy = static.BuildStrategy()
build_strategy.gradient_scale_strategy = \
static.BuildStrategy.GradientScaleStrategy.Customized
compiled_prog = static.CompiledProgram(
static.default_main_program()).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy,
places=places)
dev_count = len(places)
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
loss_grad_name = loss.name+"@GRAD"
loss_data = exe.run(compiled_prog,
feed={"X": x, loss_grad_name : loss_grad},
fetch_list=[loss.name, loss_grad_name])
)DOC")
.def_property(
"debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.debug_graphviz_path_ = path;
},
R"DOC((str, optional): debug_graphviz_path indicates the path that
writing the SSA Graph to file in the form of graphviz.
It is useful for debugging. Default is empty string, that is, ""
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.debug_graphviz_path = "./graph"
)DOC")
.def_property(
"enable_sequential_execution",
[](const BuildStrategy &self) {
return self.enable_sequential_execution_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_sequential_execution_ = b;
},
R"DOC((bool, optional): If set True, the execution order of ops would
be the same as what is in the program. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_sequential_execution = True
)DOC")
.def_property(
"remove_unnecessary_lock",
[](const BuildStrategy &self) {
return self.remove_unnecessary_lock_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.remove_unnecessary_lock_ = b;
},
R"DOC((bool, optional): If set True, some locks in GPU ops would be
released and ParallelExecutor would run faster. Default is True.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
)DOC")
.def_property(
"num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; },
[](BuildStrategy &self, int num_trainers) {
#ifdef WIN32
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
#endif
self.num_trainers_ = num_trainers;
})
.def_property(
"trainers_endpoints",
[](const BuildStrategy &self) { return self.trainers_endpoints_; },
[](BuildStrategy &self,
const std::vector<std::string> &trainers_endpoints) {
self.trainers_endpoints_ = trainers_endpoints;
})
.def_property(
"trainer_id",
[](const BuildStrategy &self) { return self.trainer_id_; },
[](BuildStrategy &self, int trainer_id) {
self.trainer_id_ = trainer_id;
})
.def_property(
"nccl_comm_num",
[](const BuildStrategy &self) { return self.nccl_comm_num_; },
[](BuildStrategy &self, int nccl_comm_num) {
self.nccl_comm_num_ = nccl_comm_num;
})
.def_property(
"bkcl_comm_num",
[](const BuildStrategy &self) { return self.bkcl_comm_num_; },
[](BuildStrategy &self, int bkcl_comm_num) {
self.bkcl_comm_num_ = bkcl_comm_num;
})
.def_property(
"use_hierarchical_allreduce",
[](const BuildStrategy &self) {
return self.use_hierarchical_allreduce_;
},
[](BuildStrategy &self, bool use) {
self.use_hierarchical_allreduce_ = use;
})
.def_property(
"hierarchical_allreduce_inter_nranks",
[](const BuildStrategy &self) {
return self.hierarchical_allreduce_inter_nranks_;
},
[](BuildStrategy &self, int nranks) {
self.hierarchical_allreduce_inter_nranks_ = nranks;
})
.def_property(
"fuse_elewise_add_act_ops",
[](const BuildStrategy &self) {
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_elewise_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC")
.def_property(
"fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_gemm_epilogue_ = b;
},
R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
to fuse matmul_op, elemenewist_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_gemm_epilogue = True
)DOC")
.def_property(
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC")
.def_property(
"fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether
to fuse batch_norm, elementwise_add and activation_op,
it may make the execution faster. Default is True
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_add_act_ops = True
)DOC")
.def_property(
"enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_auto_fusion_ = b;
},
R"DOC((bool, optional): Whether to enable fusing subgraph to a
fusion_group. Now we only support fusing subgraph that composed
of elementwise-like operators, such as elementwise_add/mul
without broadcast and activations.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_auto_fusion = True
)DOC")
.def_property(
"fuse_relu_depthwise_conv",
[](const BuildStrategy &self) {
return self.fuse_relu_depthwise_conv_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_relu_depthwise_conv_ = b;
},
R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether
to fuse relu and depthwise_conv2d,
it will save GPU memory and may make the execution faster.
This options is only available in GPU devices.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_relu_depthwise_conv = True
)DOC")
.def_property(
"fuse_broadcast_ops",
[](const BuildStrategy &self) {
return self.fuse_broadcast_ops_ == true ||
self.fuse_broadcast_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_broadcast_ops_ = b;
},
R"DOC((bool, optional): fuse_broadcast_op indicates whether
to fuse the broadcast ops. Note that, in Reduce mode,
fusing broadcast ops may make the program faster. Because
fusing broadcast OP equals delaying the execution of all
broadcast Ops, in this case, all nccl streams are used only
for NCCLReduce operations for a period of time. Default False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
)DOC")
.def_property(
"fuse_all_optimizer_ops",
[](const BuildStrategy &self) {
return self.fuse_all_optimizer_ops_ == true ||
self.fuse_all_optimizer_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_all_optimizer_ops_ = b;
})
.def_property(
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.sync_batch_norm_ = b;
},
R"DOC((bool, optional): sync_batch_norm indicates whether to use
                synchronous batch normalization, which synchronizes the mean
                and variance across multiple devices in the training phase.
                The current implementation does not support FP16 training or CPU.
                Synchronization only happens within one machine, not across machines.
                Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True
)DOC")
.def_property(
"memory_optimize",
[](const BuildStrategy &self) -> py::object {
if (self.memory_optimize_) {
return py::cast(self.memory_optimize_.get());
} else {
return py::cast(nullptr);
}
},
[](BuildStrategy &self, const py::handle &value) {
auto *py_obj = value.ptr();
if (py_obj == nullptr || py_obj == Py_None) {
self.memory_optimize_ = paddle::none;
} else if (PyBool_Check(py_obj)) {
self.memory_optimize_ = (py_obj == Py_True);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"BuildStrategy.memory_optimize must be set to None, False "
"or True"));
}
},
R"DOC((bool, optional): memory opitimize aims to save total memory
consumption, set to True to enable it.
Default None. None means framework would choose to use or not use
this strategy automatically. Currently, None means that it is
enabled when GC is disabled, and disabled when GC is enabled.
True means enabling and False means disabling. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.memory_optimize = True
)DOC")
.def_property(
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) {
#ifdef WIN32
if (b) {
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
}
#else
self.is_distribution_ = b;
#endif
})
.def_property(
"async_mode",
[](const BuildStrategy &self) { return self.async_mode_; },
[](BuildStrategy &self, bool b) { self.async_mode_ = b; })
.def_property(
"enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property(
"fuse_all_reduce_ops",
[](const BuildStrategy &self) {
return self.fuse_all_reduce_ops_ == true ||
self.fuse_all_reduce_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
.def_property(
"enable_backward_optimizer_op_deps",
[](const BuildStrategy &self) {
return self.enable_backward_optimizer_op_deps_;
},
[](BuildStrategy &self, bool b) {
self.enable_backward_optimizer_op_deps_ = b;
})
.def_property(
"cache_runtime_context",
[](const BuildStrategy &self) { return self.cache_runtime_context_; },
[](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
.def_property(
"mkldnn_enabled_op_types",
[](const BuildStrategy &self) {
return self.mkldnn_enabled_op_types_;
},
[](BuildStrategy &self,
const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
})
.def_property(
"fix_op_run_order",
[](const BuildStrategy &self) { return self.fix_op_run_order_; },
[](BuildStrategy &self, bool fix_op_run_order) {
self.fix_op_run_order_ = fix_op_run_order;
})
.def_property(
"allow_cuda_graph_capture",
[](const BuildStrategy &self) {
return self.allow_cuda_graph_capture_;
},
[](BuildStrategy &self, bool allow_cuda_graph_capture) {
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
})
.def("_copy",
[](const BuildStrategy &self) {
auto new_bs = self;
new_bs.ClearFinalized();
return new_bs;
})
.def(
"_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true);
},
R"DOC(Allow user to customized passes. Normally model-specific
optimization passes should be defined in this way. BuildStrategy
cannot be updated after being finalized.)DOC");
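// Hedged Python-side sketch for customizing passes via the binding above
// (PassBuilder method names such as append_pass are assumed from the fluid
// pass_builder bindings and are illustrative only):
//   import paddle.static as static
//   paddle.enable_static()
//   build_strategy = static.BuildStrategy()
//   pass_builder = build_strategy._finalize_strategy_and_create_passes()
//   pass_builder.append_pass("graph_viz_pass")  # append a model-specific pass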
m.def("_set_cached_executor_build_strategy",
[](int64_t program_id, const BuildStrategy &build_strategy) {
auto &cached_exe_info = framework::ExecutorInfoCache::Instance();
cached_exe_info.SetBuildStrategy(program_id, build_strategy);
});
pe.def(py::init<const std::vector<platform::Place> &,
const std::vector<std::string> &,
const std::string &,
Scope *,
std::vector<Scope *> &,
const ExecutionStrategy &,
const BuildStrategy &,
ir::Graph *>())
// NOTE: even though we return a vec<Scope*>* to Python with reference
// policy, we still cannot get local_scope from this vector, since the
// elements of vec<Scope*> will be freed by the Python GC. We can only
// return Scope* one by one and mark them as references.
.def(
"local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
.def("_need_create_local_exe_scopes",
&ParallelExecutor::NeedCreateLocalExeScope)
.def("feed_tensors_into_local_scopes",
&ParallelExecutor::FeedTensorsIntoLocalScopes)
.def("feed_and_split_tensor_into_local_scopes",
&ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
.def("run",
[](ParallelExecutor &self,
const std::vector<std::string> &fetch_tensors,
bool return_merged) -> py::object {
if (return_merged) {
paddle::framework::FetchList ret;
/*gil_scoped_release*/ {
pybind11::gil_scoped_release release;
ret = self.RunAndMerge(fetch_tensors);
}
return py::cast(std::move(ret));
} else {
paddle::framework::FetchUnmergedList ret;
/*gil_scoped_release*/ {
pybind11::gil_scoped_release release;
ret = self.Run(fetch_tensors);
}
return py::cast(std::move(ret));
}
})
.def("device_count", &ParallelExecutor::DeviceCount);
#ifdef PADDLE_WITH_IPU
py::class_<platform::ipu::IpuBackend,
std::unique_ptr<platform::ipu::IpuBackend, py::nodelete>>(
...@@ -4790,6 +2352,9 @@ All parameter, weight, gradient are variables in Paddle.
BindFleetWrapper(&m);
BindIO(&m);
BindParallelExecutor(m);
BindPlace(m);
BindTensor(m);
#if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS)
BindHeterWrapper(&m);
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/tensor.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
PyTypeObject *g_framework_tensor_pytype = nullptr;
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst,
const framework::Tensor &src,
const PlaceType &place,
int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(src, place, dst);
} else {
auto sliced = src.Slice(0, batch_size);
framework::TensorCopy(sliced, place, dst);
}
}
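// Hedged Python-side sketch of the `_copy_from` bindings registered below
// (`t` and `src` are assumed to be framework Tensors, e.g. from var.get_tensor()):
//   t._copy_from(src, paddle.CPUPlace())                # copy the whole tensor
//   t._copy_from(src, paddle.CPUPlace(), batch_size=2)  # copy only the first 2 rows (slice along dim 0)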
void BindTensor(pybind11::module &m) { // NOLINT
using namespace paddle::framework; // NOLINT
py::class_<framework::Tensor> framework_tensor(
m, "Tensor", py::buffer_protocol());
g_framework_tensor_pytype =
reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
.def("__array__",
[](framework::Tensor &self) { return TensorToPyArray(self); })
.def("_ptr",
[](const framework::Tensor &self) {
return reinterpret_cast<uintptr_t>(self.data());
})
.def("_slice", &framework::Tensor::Slice)
.def("_numel", &framework::Tensor::numel)
.def("_is_initialized",
[](const framework::Tensor &self) { return self.IsInitialized(); })
.def("_get_dims",
[](const framework::Tensor &self) { return vectorize(self.dims()); })
.def("_set_dims",
[](framework::Tensor &self, const std::vector<int64_t> &dim) {
self.Resize(phi::make_ddim(dim));
})
.def("_set_layout",
[](framework::Tensor &self, const std::string &layout) {
self.set_layout(StringToDataLayout(layout));
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_float",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC(
Set the data of Tensor on place with given numpy array.
Args:
array (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the
Tensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
)DOC")
.def(
"shape",
[](framework::Tensor &self) { return vectorize(self.dims()); },
R"DOC(
Return the shape of Tensor.
Returns:
list[int]: The shape of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
print(t.shape()) # [5, 30]
)DOC")
.def("_to_dlpack",
[](framework::Tensor &self) {
DLPackTensor dlpack_tensor(self, 1);
DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor();
auto capsule = py::capsule(
static_cast<void *>(dmt), "dltensor", [](PyObject *ptr) {
if (ptr) {
auto dltensor = new DLManagedTensor;
try {
dltensor = reinterpret_cast<DLManagedTensor *>(
PyCapsule_GetPointer(ptr, "used_dltensor"));
return;
} catch (...) {
dltensor = reinterpret_cast<DLManagedTensor *>(
PyCapsule_GetPointer(ptr, "dltensor"));
}
dltensor->deleter(dltensor);
}
});
return capsule;
})
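// Hedged note on `_to_dlpack` above: the tensor is wrapped in a DLPack capsule
// named "dltensor"; a consumer renames it to "used_dltensor" after taking
// ownership, so the capsule destructor only calls the deleter when the capsule
// was never consumed. On the Python side this is typically driven through
// paddle.utils.dlpack.to_dlpack / from_dlpack (API names assumed here).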
.def("_set_float_element", TensorSetElement<float>)
.def("_get_float_element", TensorGetElement<float>)
.def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>)
.def("_place", [](framework::Tensor &self) { return self.place(); })
.def("_dtype",
[](framework::Tensor &self) {
return framework::TransToProtoVarType(self.type());
})
.def("_layout",
[](framework::Tensor &self) {
return DataLayoutToString(self.layout());
})
.def("_share_data_with", &framework::Tensor::ShareDataWith)
.def("__getitem__", PySliceTensor, py::return_value_policy::reference)
.def("__str__",
[](const framework::Tensor &self) {
std::stringstream ostr;
ostr << self;
return ostr.str();
}) /* ------ End of original Tensor ------ */
.def("__init__",
[](framework::Tensor &instance,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
std::copy(recursive_sequence_lengths.begin(),
recursive_sequence_lengths.end(),
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, -1),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
"the LoD converted by recursive_sequence_lengths is %s",
new_lod));
new (&instance) framework::Tensor(new_offset_lod);
})
.def("__init__",
[](framework::Tensor &instance) {
new (&instance) framework::Tensor();
})
// We implement offset based LOD in C++ while we use length based with
// Python API. So we changed set_lod to set_recursive_sequence_lengths
// to
// avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def(
"set_lod",
[](framework::Tensor &self,
const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
PADDLE_ENFORCE_EQ(
CheckLoD(new_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided LoD is invalid, the LoD is %s", new_lod));
self.set_lod(new_lod);
},
py::arg("lod"),
R"DOC(
Set LoD of the Tensor.
Args:
lod (list[list[int]]): The lod to set.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
.def(
"set_recursive_sequence_lengths",
[](framework::Tensor &self,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
// the input recursive_sequence_lengths is length-based
// level-of-detail info
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
std::copy(recursive_sequence_lengths.begin(),
recursive_sequence_lengths.end(),
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
"the LoD converted by recursive_sequence_lengths is "
"%s",
new_lod));
self.set_lod(new_offset_lod);
},
py::arg("recursive_sequence_lengths"),
R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]].
Args:
recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
print(t.lod()) # [[0, 2, 5]]
)DOC")
.def(
"lod",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the offset-based lod info
LoD lod = self.lod();
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
Return the LoD of the Tensor.
Returns:
list[list[int]]: The lod of the Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// Set above comments of set_lod.
.def(
"recursive_sequence_lengths",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info
LoD lod = phi::ConvertToLengthBasedLoD(self.lod());
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
Return the recursive sequence lengths corresponding to the LoD
of the Tensor.
Returns:
list[list[int]]: The recursive sequence lengths.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
)DOC")
.def(
"has_valid_recursive_sequence_lengths",
[](framework::Tensor &self) -> bool {
// Check that the lod info is valid and match the outermost
// dimension of the Tensor data
return CheckLoD(self.lod(), vectorize(self.dims()).front());
},
R"DOC(
Check whether the LoD of the Tensor is valid.
Returns:
bool: Whether the LoD is valid.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.has_valid_recursive_sequence_lengths()) # True
)DOC")
.def("_as_type",
[](const framework::Tensor &self,
paddle::framework::proto::VarType::Type type) {
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TransDataType(self, type, &dst);
}
return dst;
})
.def("_copy",
[](const framework::Tensor &self, const platform::Place &place) {
// follow fetch_op's implementation
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TensorCopySync(self, place, &dst);
} else {
// Not copy, if the src tensor is empty.
dst.clear();
dst.Resize({0});
}
dst.set_lod(self.lod());
return dst;
#ifdef _WIN32
});
#else
})
#ifdef PADDLE_WITH_CUDA
.def("_share_buffer_with",
[](framework::Tensor &self, const framework::Tensor src,
py::tuple t) {
auto *cuda_ipc_allocation =
dynamic_cast<memory::allocation::CudaIpcAllocation *>(
src.Holder().get());
PADDLE_ENFORCE_NOT_NULL(
cuda_ipc_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not Cuda IPC shared tensor. "
"Now only Tensor shared by cuda ipc could use this "
"api."));
size_t size = t[0].cast<size_t>();
auto dtype =
static_cast<paddle::experimental::DataType>(t[1].cast<int>());
auto dims = phi::make_ddim(t[2].cast<std::vector<int>>());
auto lod_info = t[3].cast<framework::LoD>();
auto device_id = t[4].cast<int>();
auto shared_reader_holder =
std::make_shared<memory::allocation::Allocation>(
cuda_ipc_allocation->ptr(),
cuda_ipc_allocation->base_ptr(), size,
platform::CUDAPlace(device_id));
self.ResetHolderWithType(shared_reader_holder, dtype);
self.Resize(dims);
self.set_lod(lod_info);
VLOG(6) << "Reconstructed tensor with buffer shared!";
},
R"DOC(
Deserialize a GPU Tensor from an existing shared CUDA IPC tensor.
Params:
tensor: Shared CUDA IPC tensor.
tuple: contains data size, data type,
tensor dims, lod information, device index.
)DOC")
.def("_share_cuda",
[](framework::Tensor self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass "
"to shared memory. ");
auto *holder = dynamic_cast<memory::allocation::Allocation *>(
self.Holder().get());
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(holder->place()), true,
platform::errors::InvalidArgument(
"Tensor is not on GPU. share_cuda only support GPU "
"Tensor, share_filename is for CPU tensor."));
void *base_ptr = holder->base_ptr();
ptrdiff_t offset_bytes = reinterpret_cast<char *>(holder->ptr()) -
reinterpret_cast<char *>(base_ptr);
cudaIpcMemHandle_t handle;
PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr));
auto _handle = py::bytes(reinterpret_cast<char *>(&handle),
(py::ssize_t)CUDA_IPC_HANDLE_SIZE);
// TODO(ZHUI): use cuda event, to avoid sync.
const auto &device_id = paddle::platform::GetCurrentDeviceId();
auto stream =
paddle::platform::stream::get_current_stream(device_id);
stream->Synchronize();
int type_idx = static_cast<int>(self.type());
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size,
type_idx, vectorize(self.dims()), self.lod(),
device_id);
},
R"DOC(
Serialize GPU Tensor by cudaIpcMemHandle.
Returns:
tuple: contains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
)DOC")
.def("_new_shared_cuda",
[](py::tuple t) {
if (t.size() != 7)
throw std::runtime_error(
"Invalid Tensor meta info for shared cuda tensor!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation from handle
const std::string &handle = t[0].cast<std::string>();
ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast<int64_t>();
auto device_id = t[6].cast<int>();
auto base_ptr = memory::allocation::GetIpcBasePtr(handle);
size_t size = t[2].cast<size_t>();
void *dev = base_ptr.get();
dev = reinterpret_cast<char *>(dev) + offset_bytes;
auto shared_reader_holder =
std::make_shared<memory::allocation::CudaIpcAllocation>(
dev, size, device_id, std::move(base_ptr));
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[3].cast<int>()));
tensor.Resize(phi::make_ddim(t[4].cast<std::vector<int>>()));
tensor.set_lod(t[5].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize GPU lod tensor from cudaIpcMemHandle.
Params:
tuple: contains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))
)DOC")
#endif
.def("_share_filename",
[](framework::Tensor &self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass to "
"shared memory. ");
auto holder = self.Holder();
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(holder->place()) ||
platform::is_cuda_pinned_place(holder->place()),
true, platform::errors::InvalidArgument(
"Tensor is not on CPU. share_filename only "
"support CPU Tensor."));
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
holder.get());
// If the tensor is not shared, allocate memory map allocation.
if (mmap_allocation == nullptr) {
void *data_ptr = self.data();
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName();
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size);
// copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) {
#ifdef PADDLE_WITH_CUDA
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CUDAPinnedPlace(), data_ptr, data_size);
#endif
} else {
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CPUPlace(), data_ptr, data_size);
}
self.ResetHolder(shared_holder);
mmap_allocation = shared_holder.get();
}
int type_idx = static_cast<int>(self.type());
return py::make_tuple(mmap_allocation->ipc_name(),
mmap_allocation->size(), type_idx,
vectorize(self.dims()), self.lod());
},
R"DOC(
Serialize CPU lod tensor in shared memory to tuple.
If the tensor is not in shared memory, we will copy it first.
Returns:
tuple: contains ipc name, data size, data type,
tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
)DOC")
.def("_new_shared_filename",
[](py::tuple t) { // __setstate__
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor meta info state!");
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE;
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size);
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize CPU lod tensor from shared memory.
Params:
tuple: contains ipc file name, data size, data type,
tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))
)DOC")
.def("_shared_incref",
[](framework::Tensor &self) {
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
self.Holder().get());
if (mmap_allocation) {
mmap_allocation->incref();
}
},
R"DOC(
Increase reference count of share_filename tensor.
)DOC")
.def("_shared_decref",
[](framework::Tensor &self) {
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
self.Holder().get());
if (mmap_allocation) {
mmap_allocation->decref();
}
},
R"DOC(
Decrease reference count of share_filename tensor.
)DOC")
.def(py::pickle(
[](const framework::Tensor &t) { // __getstate__
auto holder = t.Holder();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true,
platform::errors::PreconditionNotMet(
"Tensor is not on CPU."
"Now only Tensor on CPU can be serialized."));
auto *mmap_writer_allocation =
dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
holder.get());
PADDLE_ENFORCE_NOT_NULL(
mmap_writer_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not in shared memory."
"Now only Tensor on shared memory can be serialized."));
int type_idx = static_cast<int>(t.type());
return py::make_tuple(mmap_writer_allocation->ipc_name(),
mmap_writer_allocation->size(), type_idx,
vectorize(t.dims()), t.lod());
},
[](py::tuple t) { // __setstate__
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor state!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
auto shared_reader_holder =
memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name,
size);
// 3. Maintain global fd set
VLOG(3) << "Tensor ipc name: " << ipc_name;
memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
// 4. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
}));
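// Hedged note on the pickle support above: __getstate__ only succeeds for CPU
// tensors whose holder is a MemoryMapWriterAllocation (i.e. tensors already
// placed in shared memory); __setstate__ reopens the same ipc_name mapping,
// records it in MemoryMapFdSet and rebuilds the Tensor with the saved dtype,
// dims and lod.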
#endif
py::class_<phi::SelectedRows>(m, "SelectedRows")
.def("__init__",
[](phi::SelectedRows &instance) {
new (&instance) phi::SelectedRows();
})
.def("__init__",
[](phi::SelectedRows &instance,
const std::vector<int64_t> rows,
const int64_t &height) {
new (&instance) phi::SelectedRows(rows, height);
})
.def(
"get_tensor",
[](phi::SelectedRows &self) { return self.mutable_value(); },
py::return_value_policy::reference)
.def("numel",
[](phi::SelectedRows &self) -> int64_t {
return self.value().numel();
})
.def("set_height", &phi::SelectedRows::set_height)
.def("height", &phi::SelectedRows::height)
.def("set_rows",
[](phi::SelectedRows &self, std::vector<int64_t> rows) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self.set_rows(rows);
#else
Vector<int64_t> new_rows(rows);
self.set_rows(new_rows);
#endif
})
.def("sync_index",
[](phi::SelectedRows &instance) { instance.SyncIndex(); })
.def("rows", [](phi::SelectedRows &self) {
auto rows = self.rows();
std::vector<int64_t> new_rows;
new_rows.reserve(rows.size());
std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
return new_rows;
});
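// Hedged Python-side sketch of the SelectedRows binding above (assumes the
// class is exposed as paddle.fluid.core.SelectedRows; names illustrative only):
//   from paddle.fluid import core
//   sr = core.SelectedRows([0, 2], 5)   # rows, height
//   sr.set_rows([0, 1, 2])
//   print(sr.rows(), sr.height())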
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
void BindTensor(pybind11::module& m); // NOLINT
} // namespace pybind
} // namespace paddle