feed improve dict/merge_patch

1fb811f8 · xiexionghang · 0dc7d425 · 0dc7d425
隐藏空白更改
内联并排

Showing with 0 addition and 1753 deletion

paddle/fluid/feed/pybind/pybind.cc paddle/fluid/feed/pybind/pybind.cc +0 -1753

未找到文件。
--- a/paddle/fluid/feed/pybind/pybind.cc
+++ b/paddle/fluid/feed/pybind/pybind.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <Python.h>
-#include <algorithm>
-#include <cstdlib>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT // for call_once
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/py_func_op.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-#include "paddle/fluid/pybind/const_value.h"
-#include "paddle/fluid/pybind/data_set_py.h"
-#include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/fleet_wrapper_py.h"
-#include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/pybind/inference_api.h"
-#include "paddle/fluid/pybind/ir.h"
-#include "paddle/fluid/pybind/expand_api.h"
-#ifndef _WIN32
-#include "paddle/fluid/pybind/nccl_wrapper_py.h"
-#endif
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/pybind/pybind.h"  // NOLINT
-#include "paddle/fluid/pybind/reader_py.h"
-#include "paddle/fluid/pybind/tensor_py.h"
-#include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#endif
-#include "paddle/fluid/platform/cuda_profiler.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/pybind/communicator_py.h"
-#endif
-#include "pybind11/stl.h"
-DEFINE_bool(reader_queue_speed_test_mode, false,
-            "If set true, the queue.pop will only get data from queue but not "
-            "remove the data from queue for speed testing");
-DECLARE_bool(use_mkldnn);
-#ifdef PADDLE_WITH_NGRAPH
-DECLARE_bool(use_ngraph);
-#endif
-// disable auto conversion to list in Python
-PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
-namespace paddle {
-namespace pybind {
-bool IsCompiledWithCUDA() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-bool IsCompiledWithMKLDNN() {
-#ifndef PADDLE_WITH_MKLDNN
-  return false;
-#else
-  return true;
-#endif
-}
-bool IsCompiledWithNGRAPH() {
-#ifndef PADDLE_WITH_NGRAPH
-  return false;
-#else
-  return true;
-#endif
-}
-bool IsCompiledWithBrpc() {
-#ifndef PADDLE_WITH_DISTRIBUTE
-  return false;
-#endif
-#ifdef PADDLE_WITH_GRPC
-  return false;
-#endif
-  return true;
-}
-bool IsCompiledWithDIST() {
-#ifdef PADDLE_WITH_DISTRIBUTE
-  return true;
-#else
-  return false;
-#endif
-}
-template <typename PlaceType1, typename PlaceType2>
-static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
-  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
-}
-template <typename PlaceType>
-static inline int PlaceIndex(const PlaceType &p) {
-  return static_cast<int>(paddle::platform::Place(p).which());
-}
-#ifdef PADDLE_WITH_AVX
-PYBIND11_MODULE(core_avx, m) {
-#else
-PYBIND11_MODULE(core_noavx, m) {
-#endif
-  // Not used, just make sure cpu_info.cc is linked.
-  paddle::platform::CpuTotalPhysicalMemory();
-  paddle::memory::allocation::UseAllocatorStrategyGFlag();
-  m.doc() = "C++ core of PaddlePaddle";
-  // using framework in this function. Since it is inside a function, it will
-  // not cause namespace pollution.
-  using namespace paddle::framework;  // NOLINT
-  BindException(&m);
-  m.def("set_num_threads", &platform::SetNumThreads);
-  m.def(
-      "_append_python_callable_object_and_return_id",
-      [](py::object py_obj) -> size_t {
-        return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
-      });
-  m.def("_get_use_default_grad_op_desc_maker_ops",
-        [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); });
-  // NOTE(zjl): ctest would load environment variables at the beginning even
-  // though we have not `import paddle.fluid as fluid`. So we add this API
-  // to enable eager deletion mode in unittest.
-  m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
-  m.def("_set_fuse_parameter_group_size",
-        &paddle::framework::ir::SetFuseParameterGroupsSize);
-  m.def("_set_fuse_parameter_memory_size",
-        &paddle::framework::ir::SetFuseParameterMemorySize);
-  m.add_object("_cleanup",
-               py::capsule([]() { ScopePool::Instance().Clear(); }));
-  m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath);
-  BindImperative(&m);
-  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
-      .def("_is_initialized",
-           [](const Tensor &self) { return self.IsInitialized(); })
-      .def("_get_dims",
-           [](const Tensor &self) { return vectorize(self.dims()); })
-      .def("_set_dims",
-           [](Tensor &self, const std::vector<int64_t> &dim) {
-             self.Resize(make_ddim(dim));
-           })
-      .def("_set_layout",
-           [](Tensor &self, const std::string &layout) {
-             self.set_layout(StringToDataLayout(layout));
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_alloc_double",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<double>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_clear", &Tensor::clear)
-      .def("set", PyCPUTensorSetFromArray<float>)
-      .def("set", PyCPUTensorSetFromArray<int>)
-      .def("set", PyCPUTensorSetFromArray<double>)
-      .def("set", PyCPUTensorSetFromArray<int64_t>)
-      .def("set", PyCPUTensorSetFromArray<bool>)
-      .def("set", PyCPUTensorSetFromArray<uint16_t>)
-      .def("set", PyCPUTensorSetFromArray<uint8_t>)
-      .def("set", PyCPUTensorSetFromArray<int8_t>)
-#ifdef PADDLE_WITH_CUDA
-      .def("set", PyCUDATensorSetFromArray<float>)
-      .def("set", PyCUDATensorSetFromArray<int>)
-      .def("set", PyCUDATensorSetFromArray<double>)
-      .def("set", PyCUDATensorSetFromArray<int64_t>)
-      .def("set", PyCUDATensorSetFromArray<bool>)
-      .def("set", PyCUDATensorSetFromArray<uint16_t>)
-      .def("set", PyCUDATensorSetFromArray<uint8_t>)
-      .def("set", PyCUDATensorSetFromArray<int8_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<float>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<double>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int8_t>)
-#endif
-      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("_set_float_element", TensorSetElement<float>)
-      .def("_get_float_element", TensorGetElement<float>)
-      .def("_set_double_element", TensorSetElement<double>)
-      .def("_get_double_element", TensorGetElement<double>)
-      .def("_place", [](Tensor &self) { return self.place(); })
-      .def("_dtype", [](Tensor &self) { return self.type(); })
-      .def("__getitem__", PySliceTensor, py::return_value_policy::reference)
-      .def("__str__", [](const Tensor &self) {
-        std::stringstream ostr;
-        ostr << self;
-        return ostr.str();
-      });
-  py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
-    LoDTensor is a Tensor with optional LoD information.
-    np.array(lod_tensor) can convert LoDTensor to numpy array.
-    lod_tensor.lod() can retrieve the LoD information.
-    LoD is short for Level of Details and is usually used for varied sequence
-    length. You can skip the following comment if you don't need optional LoD.
-    For example, a LoDTensor X can look like the example below. It contains
-    2 sequences. The first has length 2 and the second has length 3, as
-    described by x.lod.
-    The first tensor dimension 5=2+3 is calculated from LoD if it's available.
-    It means the total number of sequence element. In X, each element has 2
-    columns, hence [5, 2].
-    x.lod  = [[2, 3]]
-    x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
-    x.shape = [5, 2]
-    LoD can have multiple levels (for example, a paragraph can have multiple
-    sentences and a sentence can have multiple words). In the following
-    LodTensor Y, the lod_level is 2. It means there are 2 sequence, the
-    first sequence length is 2 (has 2 sub-sequences), the second one's
-    length is 1. The first sequence's 2 sub-sequences have length 2 and 2,
-    respectively. And the second sequence's 1 sub-sequence has length 3.
-    y.lod = [[2 1], [2 2 3]]
-    y.shape = [2+2+3, ...]
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          t = fluid.LoDTensor()
-  Note:
-      In above description, LoD is length-based. In Paddle internal
-      implementation, lod is offset-based. Hence, internally,
-      y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based
-      equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
-      Sometimes LoD is called recursive_sequence_length to be more
-      self-explanatory. In this case, it must be length-based. Due to history
-      reasons. when LoD is called lod in public API, it might be offset-based.
-      Users should be careful about it.
-        )DOC")
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
-      .def("__init__",
-           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
-                                       &recursive_sequence_lengths) {
-             LoD new_lod;
-             new_lod.reserve(recursive_sequence_lengths.size());
-             std::copy(recursive_sequence_lengths.begin(),
-                       recursive_sequence_lengths.end(),
-                       std::back_inserter(new_lod));
-             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_offset_lod, -1), true,
-                 "the provided recursive_sequence_lengths info is invalid");
-             new (&instance) LoDTensor(new_offset_lod);
-           })
-      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
-      // We implement offset based LOD in C++ while we use length based with
-      // Python API. So we changed set_lod to set_recursive_sequence_lengths to
-      // avoid misuse.
-      // The discussion is here:
-      // https://github.com/PaddlePaddle/Paddle/issues/10855
-      .def("set_lod",
-           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-             // the input lod is offset-based level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_lod, vectorize(self.dims()).front()), true,
-                 "the provided lod info is invalid");
-             self.set_lod(new_lod);
-           },
-           py::arg("lod"), R"DOC(
-           Set LoD of the LoDTensor.
-           Args:
-               lod (List[List[int]]): the lod to be set.
-           Examples:
-               .. code-block:: python
-                 import paddle.fluid as fluid
-                 import numpy as np
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_lod([[0, 2, 5]])
-           )DOC")
-      .def("set_recursive_sequence_lengths",
-           [](LoDTensor &self, const std::vector<std::vector<size_t>>
-                                   &recursive_sequence_lengths) {
-             // the input recursive_sequence_lengths is length-based
-             // level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(recursive_sequence_lengths.size());
-             std::copy(recursive_sequence_lengths.begin(),
-                       recursive_sequence_lengths.end(),
-                       std::back_inserter(new_lod));
-             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
-                 "the provided recursive_sequence_lengths info is invalid");
-             self.set_lod(new_offset_lod);
-           },
-           py::arg("recursive_sequence_lengths"), R"DOC(
-           Set LoD of the LoDTensor according to recursive sequence length.
-           For example, if recursive_sequence_lengths=[[2, 3]], meaning that
-           there are two sequences with length 2 and 3 respectively, the
-           corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
-           Args:
-                recursive_sequence_lengths (List[List[int]]): sequence lengths.
-           Examples:
-               .. code-block:: python
-                 import paddle.fluid as fluid
-                 import numpy as np
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-           )DOC")
-      .def("lod",
-           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-             // output the offset-based lod info
-             LoD lod = self.lod();
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
-           Return the LoD of the LoDTensor.
-           Returns:
-               out (List[List[int]]): the lod of the LoDTensor.
-           Examples:
-               .. code-block:: python
-                 import paddle.fluid as fluid
-                 import numpy as np
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_lod([[0, 2, 5]])
-                 print(t.lod()) # [[0, 2, 5]]
-           )DOC")
-      // Set above comments of set_lod.
-      .def("recursive_sequence_lengths",
-           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-             // output the length-based lod info
-             LoD lod = ConvertToLengthBasedLoD(self.lod());
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
-           Return the sequence length of the LoDTensor corresponding to LoD.
-           Returns:
-               out (List[List[int]): the sequence lengths.
-           Examples:
-               .. code-block:: python
-                 import paddle.fluid as fluid
-                 import numpy as np
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-                 print(t.recursive_sequence_lengths()) # [[2, 3]]
-           )DOC")
-      .def("has_valid_recursive_sequence_lengths",
-           [](LoDTensor &self) -> bool {
-             // Check that the lod info is valid and match the outermost
-             // dimension of the LoDTensor data
-             return CheckLoD(self.lod(), vectorize(self.dims()).front());
-           },
-           R"DOC(
-           Check whether the lod of the LoDTensor is valid.
-           Returns:
-               out (bool): whether the lod is valid.
-           Examples:
-               .. code-block:: python
-                 import paddle.fluid as fluid
-                 import numpy as np
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-                 print(t.has_valid_recursive_sequence_lengths()) # True
-           )DOC")
-      .def("__getitem__", PySliceTensor, py::return_value_policy::reference,
-           R"DOC(
-           Slice the original Tensor, and remove the LoD information.
-           Returns:
-               out (Tensor): new Tensor(NOT LoDTensor).
-           )DOC")
-      .def("__str__",
-           [](const LoDTensor &self) {
-             std::stringstream ostr;
-             ostr << self;
-             return ostr.str();
-           })
-      .def("_copy", [](const LoDTensor &self, const platform::Place &place) {
-        // follow fetch_op's inplementation
-        LoDTensor dst;
-        if (self.IsInitialized() && self.numel() > 0) {
-          TensorCopySync(self, place, &dst);
-        } else {
-          // Not copy, if the src tensor is empty.
-          dst.clear();
-          dst.Resize({0});
-        }
-        dst.set_lod(self.lod());
-        return dst;
-      });
-  py::class_<SelectedRows>(m, "SelectedRows")
-      .def("__init__",
-           [](SelectedRows &instance) { new (&instance) SelectedRows(); })
-      .def("__init__",
-           [](SelectedRows &instance, const std::vector<int64_t> rows,
-              const int64_t &height) {
-             new (&instance) SelectedRows(rows, height);
-           })
-      .def("get_tensor",
-           [](SelectedRows &self) { return self.mutable_value(); },
-           py::return_value_policy::reference)
-      .def("numel",
-           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
-      .def("set_height", &SelectedRows::set_height)
-      .def("height", &SelectedRows::height)
-      .def("set_rows",
-           [](SelectedRows &self, std::vector<int64_t> rows) {
-#ifndef PADDLE_WITH_CUDA
-             self.set_rows(rows);
-#else
-        Vector<int64_t> new_rows(rows);
-        self.set_rows(new_rows);
-#endif
-           })
-      .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
-      .def("rows", [](SelectedRows &self) {
-        auto rows = self.rows();
-        std::vector<int64_t> new_rows;
-        new_rows.reserve(rows.size());
-        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
-        return new_rows;
-      });
-  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
-All parameter, weight, gradient are variables in Paddle.
-)DOC")
-      .def(py::init<>())
-      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
-      .def("set_int",
-           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
-      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
-      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
-      .def("set_float",
-           [](Variable &var, float val) -> void {
-             *var.GetMutable<float>() = val;
-           })
-      .def("get_float",
-           [](const Variable &var) -> float { return var.Get<float>(); })
-      .def("get_tensor",
-           [](Variable &self) -> LoDTensor * {
-             return self.GetMutable<LoDTensor>();
-           },
-           py::return_value_policy::reference)
-      .def("get_lod_rank_table",
-           [](Variable &self) { return self.GetMutable<LoDRankTable>(); },
-           py::return_value_policy::reference)
-      .def("get_selected_rows",
-           [](Variable &self) -> SelectedRows * {
-             return self.GetMutable<SelectedRows>();
-           },
-           py::return_value_policy::reference)
-      .def("get_lod_tensor_array",
-           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
-           py::return_value_policy::reference)
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-      .def("get_communicator",
-           [](Variable &self) -> platform::Communicator * {
-             return self.GetMutable<platform::Communicator>();
-           },
-           py::return_value_policy::reference)
-#endif
-      .def("get_reader",
-           [](Variable &self) -> framework::ReaderHolder * {
-             PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(), true);
-             return self.GetMutable<framework::ReaderHolder>();
-           },
-           py::return_value_policy::reference);
-  BindReader(&m);
-  using LoDTensorBlockingQueue =
-      ::paddle::operators::reader::LoDTensorBlockingQueue;
-  using LoDTensorBlockingQueueHolder =
-      ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
-  py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
-      m, "LoDTensorBlockingQueue", "")
-      .def("push",
-           [](LoDTensorBlockingQueue &self,
-              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
-             pybind11::gil_scoped_release release;
-             return self.Push(lod_tensor_vec);
-           })
-      .def("size", &LoDTensorBlockingQueue::Size)
-      .def("capacity", &LoDTensorBlockingQueue::Cap)
-      .def("close", &LoDTensorBlockingQueue::Close)
-      .def("is_closed", &LoDTensorBlockingQueue::IsClosed);
-  m.def("init_lod_tensor_blocking_queue",
-        [](Variable &var,
-           size_t capacity) -> std::shared_ptr<LoDTensorBlockingQueue> {
-          VLOG(1) << "init_lod_tensor_blocking_queue";
-          auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-          holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
-          return holder->GetQueue();
-        },
-        py::return_value_policy::copy);
-  py::class_<Scope>(m, "_Scope", R"DOC(
-    Scope is an association of a name to Variable. All variables belong to Scope.
-    Variables in a parent scope can be retrieved from local scope.
-    You need to specify a scope to run a Net, i.e., `exe.Run(&scope)`.
-    One net can run in different scopes and update different variable in the
-    scope.
-    You can create var in a scope and get it from the scope.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          # create tensor from a scope and set value to it.
-          param = scope.var('Param').get_tensor()
-          param_array = np.full((height, row_numel), 5.0).astype("float32")
-          param.set(param_array, place)
-        )DOC")
-      .def("_remove_from_pool",
-           [](Scope &self) { ScopePool::Instance().Remove(&self); })
-      .def("var",
-           [](Scope &self, const std::string &name) -> Variable * {
-             return self.Var(name);
-           },
-           py::arg("name"),
-           R"DOC(
-           Find or create variable named :code:`name` in the current scope.
-           If the variable named :code:`name` does not exist in the
-           current scope, the variable would be created. Otherwise,
-           return the existing variable.
-           Args:
-               name (str): the variable name.
-           Returns:
-               out (core.Variable): the found or created variable.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("find_var", &Scope::FindVar, py::arg("name"),
-           R"DOC(
-           Find variable named :code:`name` in the current scope or
-           its parent scope. Return None if not found.
-           Args:
-               name (str): the variable name.
-           Returns:
-               out (core.Variable|None): the found variable or None.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
-           R"DOC(
-           Create a new sub-scope of the current scope.
-           Returns:
-               out (core._Scope): the created sub-scope.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("drop_kids", &Scope::DropKids,
-           R"DOC(
-           Delete all sub-scopes of the current scope.
-           )DOC")
-      .def("_kids", &Scope::kids);
-  m.def("Scope",
-        []() -> Scope * {
-          auto *s = new Scope();
-          ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
-          return s;
-        },
-        R"DOC(
-        Create a new scope.
-        Returns:
-            out (core._Scope): the created scope.
-        )DOC",
-        py::return_value_policy::reference);
-  //! @note: Be careful! PyBind will return std::string as an unicode, not
-  //! Python str. If you want a str object, you should cast them in Python.
-  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    std::vector<py::bytes> ret_values;
-    for (auto &iter : OpInfoMap::Instance().map()) {
-      auto &info = iter.second;
-      if (info.HasOpProtoAndChecker()) {
-        std::string str;
-        PADDLE_ENFORCE_EQ(
-            info.Proto().SerializeToString(&str), true,
-            "Serialize OpProto Error. This could be a bug of Paddle.");
-        ret_values.emplace_back(str);
-      }
-    }
-    return ret_values;
-  });
-  m.def(
-      "get_grad_op_desc", [](const OpDesc &op_desc,
-                             const std::unordered_set<std::string> &no_grad_set,
-                             const std::vector<BlockDesc *> &grad_sub_block) {
-        std::unordered_map<std::string, std::string> grad_to_var;
-        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
-            framework::OpInfoMap::Instance()
-                .Get(op_desc.Type())
-                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
-                               grad_sub_block);
-        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
-        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
-                       grad_op_desc_ptrs.begin(),
-                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
-        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
-      });
-  m.def("has_grad_op_maker", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker();
-  });
-  m.def("has_infer_inplace", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
-  });
-  m.def("get_flags_use_mkldnn", []() { return FLAGS_use_mkldnn; });
-#ifdef PADDLE_WITH_NGRAPH
-  m.def("get_flags_use_ngraph", []() { return FLAGS_use_ngraph; });
-#endif
-  m.def("prune", [](const ProgramDesc &origin,
-                    const std::set<std::string> &feeded_var_names,
-                    const std::vector<std::array<size_t, 2>> &targets) {
-    ProgramDesc prog_with_targets(origin);
-    for (const auto &t : targets) {
-      prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true);
-    }
-    proto::ProgramDesc pruned_desc;
-    Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc);
-    return new ProgramDesc(pruned_desc);
-  });
-  m.def("prune_backward", [](const framework::ProgramDesc &program) {
-    return PruneBackward(program);
-  });
-  m.def("empty_var_name",
-        []() { return std::string(framework::kEmptyVarName); });
-  m.def("grad_var_suffix",
-        []() { return std::string(framework::kGradVarSuffix); });
-  m.def_submodule(
-       "var_names",
-       "The module will return special predefined variable name in Paddle")
-      .def("empty", []() { return kEmptyVarName; })
-      .def("temp", []() { return kTempVarName; });
-  // clang-format off
-  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("create",
-                  [](paddle::platform::CPUPlace& place)
-                      -> paddle::platform::DeviceContext* {
-                    return new paddle::platform::CPUDeviceContext();
-                  })
-      .def_static("create",
-                  [](paddle::platform::CUDAPlace& place)
-                      -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
-                    PADDLE_THROW("CUDAPlace is not supported in CPU device.");
-#else
-                    return new paddle::platform::CUDADeviceContext(place);
-#endif
-                  })
-          .def_static("create",
-                [](paddle::platform::CUDAPinnedPlace& place)
-                        -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
-                  PADDLE_THROW(
-                        "CUDAPinnedPlace is not supported in CPU device.");
-#else
-                  return new paddle::platform::CUDAPinnedDeviceContext(place);
-#endif
-                });;
-// clang-format on
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
-#endif
-  py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
-    CUDAPlace is a descriptor of a device. It represents a GPU, and each CUDAPlace
-    has a dev_id to indicate the number of cards represented by the current CUDAPlace.
-    The memory of CUDAPlace with different dev_id is not accessible.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          gpu_place = fluid.CUDAPlace(0)
-        )DOC")
-      .def("__init__",
-           [](platform::CUDAPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_CUDA
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid CUDAPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-             if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) {
-               if (platform::GetCUDADeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use GPU because there is no GPU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
-                     "number on your machine is %d",
-                     dev_id, platform::GetCUDADeviceCount(),
-                     platform::GetCUDADeviceCount());
-                 std::exit(-1);
-               }
-             }
-             new (&self) platform::CUDAPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use GPU because you have installed CPU version "
-                 "PaddlePaddle.\n"
-                 "If you want to use GPU, please try to install GPU version "
-                 "PaddlePaddle by: pip install paddlepaddle-gpu\n"
-                 "If you only have CPU, please change CUDAPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
-      .def("_type", &PlaceIndex<platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CUDAPlace &>);
-  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
-    CPUPlace is a descriptor of a device. It represents a CPU, and the memory
-    CPUPlace can be accessed by CPU.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          cpu_place = fluid.CPUPlace()
-        )DOC")
-      .def(py::init<>())
-      .def("_type", &PlaceIndex<platform::CPUPlace>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CPUPlace &>);
-  py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace", R"DOC(
-    CUDAPinnedPlace is a descriptor of a device. The memory of CUDAPinnedPlace
-    can be accessed by GPU and CPU.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          place = fluid.CUDAPinnedPlace()
-        )DOC")
-      .def("__init__",
-           [](platform::CUDAPinnedPlace &self) {
-#ifndef PADDLE_WITH_CUDA
-             PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
-#endif
-             new (&self) platform::CUDAPinnedPlace();
-           })
-      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
-  py::class_<platform::Place>(m, "Place")
-      .def(py::init<>())
-      .def("_type", &PlaceIndex<platform::Place>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
-      .def("is_gpu_place",
-           [](platform::Place &self) { return platform::is_gpu_place(self); })
-      .def("is_cpu_place",
-           [](platform::Place &self) { return platform::is_cpu_place(self); })
-      .def("is_cuda_pinned_place",
-           [](platform::Place &self) {
-             return platform::is_cuda_pinned_place(self);
-           })
-      .def("gpu_device_id",
-           [](platform::Place &self) {
-             return boost::get<platform::CUDAPlace>(self).device;
-           })
-      .def("set_place", [](platform::Place &self,
-                           const platform::Place &other) { self = other; })
-      .def("set_place",
-           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
-             self = cpu_place;
-           })
-      .def("set_place",
-           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
-             self = gpu_place;
-           })
-      .def("set_place", [](platform::Place &self,
-                           const platform::CUDAPinnedPlace &cuda_pinned_place) {
-        self = cuda_pinned_place;
-      });
-  py::class_<OperatorBase>(m, "Operator")
-      .def_static(
-          "create",
-          [](py::bytes protobin) {
-            proto::OpDesc desc;
-            PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true,
-                              "Cannot parse user input to OpDesc");
-            PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
-                              "User OpDesc is not initialized, reason %s",
-                              desc.InitializationErrorString());
-            return OpRegistry::CreateOp(desc);
-          })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CPUPlace &place) { self.Run(scope, place); })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CUDAPlace &place) { self.Run(scope, place); })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CUDAPinnedPlace &place) {
-             self.Run(scope, place);
-           })
-      .def("type",
-           [](const OperatorBase &op) -> std::string { return op.Type(); })
-      .def("outputs",
-           [](const OperatorBase &op)
-               -> std::map<std::string, std::vector<std::string>> {
-                 return op.Outputs();
-               })
-      .def("output_vars",
-           [](const OperatorBase &op) { return op.OutputVars(true); })
-      .def("inputs", [](const OperatorBase &op) { return op.Inputs(); })
-      .def("input_vars", [](const OperatorBase &op) { return op.InputVars(); })
-      .def("__str__", &OperatorBase::DebugString)
-      .def("no_intermediate_outputs",
-           [](const OperatorBase &op) { return op.OutputVars(false); })
-      .def("support_gpu", &OperatorBase::SupportGPU);
-  py::class_<framework::ExecutorPrepareContext>(m, "ExecutorPrepareContext")
-      .def(py::init<const ProgramDesc &, size_t>());
-  py::class_<framework::Executor>(m, "Executor")
-      .def(py::init<const platform::Place &>())
-      .def("close", &Executor::Close)
-      .def("run_from_dataset", &Executor::RunFromDataset,
-           py::call_guard<py::gil_scoped_release>())
-      .def("run_prepared_ctx",
-           [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
-              std::map<std::string, const LoDTensor *> *feed_targets,
-              std::map<std::string, LoDTensor *> *fetch_targets,
-              bool create_local_scope = true, bool create_vars = true,
-              const std::string &feed_holder_name = "feed",
-              const std::string &fetch_holder_name = "fetch") {
-             pybind11::gil_scoped_release release;
-             self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets,
-                                     create_local_scope, create_vars,
-                                     feed_holder_name, fetch_holder_name);
-           })
-      .def("run_cached_prepared_ctx",
-           [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
-              bool create_local_scope = true, bool create_vars = true,
-              bool keep_kids = false) {
-             pybind11::gil_scoped_release release;
-             self.RunPreparedContext(ctx, scope, create_local_scope,
-                                     create_vars, keep_kids);
-           })
-      .def("prepare_ctx_cache", &Executor::PrepareCtxCache,
-           py::call_guard<py::gil_scoped_release>())
-      .def("create_variables", &Executor::CreateVariables,
-           py::call_guard<py::gil_scoped_release>())
-      .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
-                     int block_id, bool create_local_scope, bool create_vars,
-                     const std::vector<std::string> &fetch_vars) {
-        pybind11::gil_scoped_release release;
-        self.Run(prog, scope, block_id, create_local_scope, create_vars,
-                 fetch_vars);
-      });
-  m.def("init_gflags", framework::InitGflags);
-  m.def("init_glog", framework::InitGLOG);
-  m.def("init_dgc", framework::InitDGC);
-  m.def("init_devices",
-        [](bool init_p2p) { framework::InitDevices(init_p2p); });
-  m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
-  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
-  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
-  m.def("is_compiled_with_dist", IsCompiledWithDIST);
-#ifdef PADDLE_WITH_CUDA
-  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
-    // Only GPUs with Compute Capability >= 53 support float16
-    return platform::GetCUDAComputeCapability(place.device) >= 53;
-  });
-#endif
-  m.def("set_feed_variable", framework::SetFeedVariable);
-  m.def("get_fetch_variable", framework::GetFetchVariable);
-  m.def("get_variable_tensor", framework::GetVariableTensor);
-  m.def("_is_program_version_supported", IsProgramVersionSupported);
-  BindProgramDesc(&m);
-  BindBlockDesc(&m);
-  BindVarDsec(&m);
-  BindOpDesc(&m);
-  BindConstValue(&m);
-  py::class_<framework::LoDRankTable>(m, "LodRankTable")
-      .def("items", [](framework::LoDRankTable &table) {
-        std::vector<std::pair<size_t, size_t>> res;
-        for (auto &item : table.items()) {
-          res.push_back({item.index, item.length});
-        }
-        return res;
-      });
-  py::class_<LoDTensorArray>(m, "LoDTensorArray", R"DOC(
-    Array of LoDTensor.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          arr = fluid.LoDTensorArray()
-)DOC")
-      .def("__init__",
-           [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); })
-      .def("__getitem__",
-           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
-           py::return_value_policy::reference)
-      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
-      .def("__setitem__",
-           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
-             PADDLE_ENFORCE_LT(i, self.size());
-             self[i].ShareDataWith(t);
-             self[i].set_lod(t.lod());
-           })
-      .def("append",
-           [](LoDTensorArray &self, const LoDTensor &t) {
-             self.emplace_back();
-             self.back().ShareDataWith(t);
-             self.back().set_lod(t.lod());
-           },
-           py::arg("tensor"), R"DOC(
-             Append a LoDensor to LoDTensorArray.
-             Examples:
-                 .. code-block:: python
-                   import paddle.fluid as fluid
-                   import numpy as np
-                   arr = fluid.LoDTensorArray()
-                   t = fluid.LoDTensor()
-                   t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                   arr.append(t)
-           )DOC")
-      .def("_move_to_list",
-           [](LoDTensorArray &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               res[i] = py::cast(std::move(self[i]));
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership);
-  m.def("op_support_gpu", OpSupportGPU);
-#ifdef PADDLE_WITH_CUDA
-  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
-#ifndef _WIN32
-  m.def("nvprof_init", platform::CudaProfilerInit);
-  m.def("nvprof_start", platform::CudaProfilerStart);
-  m.def("nvprof_stop", platform::CudaProfilerStop);
-#endif
-#endif
-  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
-      .value("kDisabled", platform::ProfilerState::kDisabled)
-      .value("kCPU", platform::ProfilerState::kCPU)
-      .value("kCUDA", platform::ProfilerState::kCUDA)
-      .value("kAll", platform::ProfilerState::kAll)
-      .export_values();
-  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
-      .value("kDefault", platform::EventSortingKey::kDefault)
-      .value("kCalls", platform::EventSortingKey::kCalls)
-      .value("kTotal", platform::EventSortingKey::kTotal)
-      .value("kMin", platform::EventSortingKey::kMin)
-      .value("kMax", platform::EventSortingKey::kMax)
-      .value("kAve", platform::EventSortingKey::kAve)
-      .export_values();
-  m.def("enable_profiler", platform::EnableProfiler);
-  m.def("disable_profiler", platform::DisableProfiler);
-  m.def("is_profiler_enabled", platform::IsProfileEnabled);
-  m.def("reset_profiler", platform::ResetProfiler);
-  m.def("get_pass", [](const std::string &pass_type) {
-    auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
-    return std::shared_ptr<framework::ir::Pass>(std::move(pass));
-  });
-  m.def("size_of_dtype", framework::SizeOfType);
-  using VarQuantScale =
-      std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
-  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
-  pass.def(py::init())
-      .def("has", &ir::Pass::Has)
-      .def("set_not_owned",
-           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
-             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
-           })
-      .def(
-          "set",
-          [](ir::Pass &self, const std::string &name, const std::string &attr) {
-            self.Set<std::string>(name, new std::string(attr));
-          })
-      .def("set", [](ir::Pass &self, const std::string &name,
-                     int val) { self.Set<const int>(name, new int(val)); })
-      .def("set",
-           [](ir::Pass &self, const std::string &name,
-              std::unordered_set<std::string> set) {
-             self.Set(name, new std::unordered_set<std::string>(set));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name,
-              std::unordered_set<int> set) {
-             self.Set(name, new std::unordered_set<int>(set));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name, VarQuantScale scales) {
-             self.Set(name, new VarQuantScale(scales));
-           })
-      .def("type", &ir::Pass::Type)
-      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
-        self.Apply(graph.get());
-      });
-  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
-      m, "PassBuilder");
-  pb.def(py::init())
-      .def("append_pass",
-           [](ir::PassBuilder &self,
-              const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
-             return self.AppendPass(pass_type);
-           })
-      .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
-      .def("insert_pass",
-           [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
-             return self.InsertPass(idx, pass_type);
-           })
-      .def("remove_pass",
-           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
-  // -- python binds for parallel executor.
-  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
-    ExecutionStrategy allows the user to more preciously control how to run
-    the program in ParallelExecutor by setting the property.
-    Examples:
-        .. code-block:: python
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-          y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-          y_predict = fluid.layers.fc(input=x, size=1, act=None)
-          cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-          avg_loss = fluid.layers.mean(cost)
-          sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-          sgd_optimizer.minimize(avg_loss)
-          exec_strategy = fluid.ExecutionStrategy()
-          exec_strategy.num_threads = 4
-          train_exe = fluid.ParallelExecutor(use_cuda=False,
-                                             loss_name=avg_loss.name,
-                                             exec_strategy=exec_strategy)
-        )DOC");
-  exec_strategy.def(py::init())
-      .def_property(
-          "num_threads",
-          [](const ExecutionStrategy &self) { return self.num_threads_; },
-          [](ExecutionStrategy &self, size_t num_threads) {
-            self.num_threads_ = num_threads;
-          },
-          R"DOC(The type is INT, num_threads represents the size of thread pool that
-            used to run the operators of the current program in ParallelExecutor.
-            If :math:`num\_threads=1`, all the operators will execute one by one,
-            but the order maybe difference between iterations.
-            If it is not set, it will be set in ParallelExecutor according to the
-            device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
-            :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
-            if it is not set, ParallelExecutor will get the cpu count by calling
-            `multiprocessing.cpu_count()`. Default 0.)DOC")
-      .def_property(
-          "use_cuda",
-          [](const ExecutionStrategy &self) { return self.use_cuda_; },
-          [](ExecutionStrategy &self, bool use_cuda) {
-            self.use_cuda_ = use_cuda;
-          })  // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
-      // make user confuse, because ParallelExecutor has a parameter named
-      // 'use_cuda' too, in current implementation, ParallelExecutor's
-      // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
-      .def_property(
-          "allow_op_delay",
-          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
-          [](ExecutionStrategy &self, bool allow_op_delay) {
-            self.allow_op_delay_ = allow_op_delay;
-          },
-          R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
-                communication operators to run, it may make the execution faster.
-                Note that this option is invalid now, and it will be removed in
-                next version. Default False.)DOC")
-      .def_property(
-          "num_iteration_per_drop_scope",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_drop_scope_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
-            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          },
-          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
-                many iterations to clean up the temp variables which
-                is generated during execution. It may make the execution faster,
-                because the temp variable's shape maybe the same between two iterations.
-                Default 1.
-                NOTES:
-                    1. If you fetch data when calling the 'run', the ParallelExecutor
-                       will clean up the temp variables at the end of the current iteration.
-                    2. In some NLP model, it may cause the GPU memory is insufficient,
-                       in this case, you should reduce `num_iteration_per_drop_scope`.
-              )DOC")
-      .def_property(
-          "num_iteration_per_run",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_run_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_run) {
-            self.num_iteration_per_run_ = num_iteration_per_run;
-          },
-          R"DOC(This config that how many iteration the executor will run when
-                user call pe.run() in python
-              )DOC")
-      .def_property("_dry_run",
-                    [](const ExecutionStrategy &self) { return self.dry_run_; },
-                    [](ExecutionStrategy &self, bool dry_run) {
-                      self.dry_run_ = dry_run;
-                    });
-  exec_strategy.def_property(
-      "use_experimental_executor",
-      [](const ExecutionStrategy &self) {
-        return self.type_ == ExecutionStrategy::kExperimental;
-      },
-      [](ExecutionStrategy &self, bool experimental) {
-        self.type_ = experimental ? ExecutionStrategy::kExperimental
-                                  : ExecutionStrategy::kDefault;
-      });
-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
-    BuildStrategy allows the user to more preciously control how to
-    build the SSA Graph in ParallelExecutor by setting the property.
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-)DOC");
-  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
-      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
-      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce);
-  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
-                                                  "GradientScaleStrategy")
-      .value("CoeffNumDevice",
-             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
-      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
-      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
-  build_strategy.def(py::init())
-      .def_property(
-          "reduce_strategy",
-          [](const BuildStrategy &self) { return self.reduce_; },
-          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.reduce_ = strategy;
-          },
-          R"DOC(The type is fluid.BuildStrategy.ReduceStrategy, there are two reduce
-                strategies in ParallelExecutor, AllReduce and Reduce. If you want
-                that all the parameters' optimization are done on all devices independently,
-                you should choose AllReduce; if you choose Reduce, all the parameters'
-                optimization will be evenly distributed to different devices, and then
-                broadcast the optimized parameter to other devices.
-                Default 'AllReduce'.
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-                  )DOC")
-      .def_property(
-          "gradient_scale_strategy",
-          [](const BuildStrategy &self) { return self.gradient_scale_; },
-          [](BuildStrategy &self,
-             BuildStrategy::GradientScaleStrategy strategy) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finalized.");
-            self.gradient_scale_ = strategy;
-          },
-          R"DOC(The type is fluid.BuildStrategy.GradientScaleStrategy, there are three
-                ways of defining :math:`loss@grad` in ParallelExecutor, CoeffNumDevice,
-                One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
-                according to the number of devices. If you want to customize :math:`loss@grad`,
-                you can choose Customized. Default 'CoeffNumDevice'.
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        import paddle.fluid.compiler as compiler
-                        import numpy
-                        import os
-                        use_cuda = True
-                        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                        exe = fluid.Executor(place)
-                        # NOTE: If you use CPU to run the program, you need
-                        # to specify the CPU_NUM, otherwise, fluid will use
-                        # all the number of the logic core as the CPU_NUM,
-                        # in that case, the batch size of the input should be
-                        # greater than CPU_NUM, if not, the process will be
-                        # failed by an exception.
-                        if not use_cuda:
-                            os.environ['CPU_NUM'] = str(2)
-                            places = fluid.cpu_places()
-                        else:
-                            places = places = fluid.cuda_places()
-                        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                        hidden = fluid.layers.fc(input=data, size=10)
-                        loss = fluid.layers.mean(hidden)
-                        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-                        fluid.default_startup_program().random_seed=1
-                        exe.run(fluid.default_startup_program())
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.gradient_scale_strategy = \
-                                 fluid.BuildStrategy.GradientScaleStrategy.Customized
-                        compiled_prog = compiler.CompiledProgram(
-                                 fluid.default_main_program()).with_data_parallel(
-                                          loss_name=loss.name, build_strategy=build_strategy,
-                                          places = places)
-                        dev_count =  len(places)
-                        x = numpy.random.random(size=(10, 1)).astype('float32')
-                        loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
-                        loss_grad_name = loss.name+"@GRAD"
-                        loss_data = exe.run(compiled_prog,
-                                             feed={"X": x, loss_grad_name : loss_grad},
-                                             fetch_list=[loss.name, loss_grad_name])
-                   )DOC")
-      .def_property(
-          "debug_graphviz_path",
-          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
-          [](BuildStrategy &self, const std::string &path) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.debug_graphviz_path_ = path;
-          },
-          R"DOC(The type is STR, debug_graphviz_path indicates the path that
-                writing the SSA Graph to file in the form of graphviz.
-                It is useful for debugging. Default ""
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.debug_graphviz_path = "./graph"
-                    )DOC")
-      .def_property(
-          "enable_sequential_execution",
-          [](const BuildStrategy &self) {
-            return self.enable_sequential_execution_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.enable_sequential_execution_ = b;
-          },
-          R"DOC(The type is BOOL. If set True, the execution order of ops would
-                be the same as what is in the program. Default False.
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.enable_sequential_execution = True
-          )DOC")
-      .def_property(
-          "remove_unnecessary_lock",
-          [](const BuildStrategy &self) {
-            return self.remove_unnecessary_lock_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.remove_unnecessary_lock_ = b;
-          },
-          R"DOC(The type is BOOL. If set True, some locks in GPU ops would be
-                released and ParallelExecutor would run faster. Default True.
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.remove_unnecessary_lock = True
-          )DOC")
-      .def_property(
-          "num_trainers",
-          [](const BuildStrategy &self) { return self.num_trainers_; },
-          [](BuildStrategy &self, int num_trainers) {
-#ifdef WIN32
-            PADDLE_THROW("Windows has NO support to distribute mode.");
-#endif
-            self.num_trainers_ = num_trainers;
-          })
-      .def_property(
-          "trainers_endpoints",
-          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
-          [](BuildStrategy &self,
-             const std::vector<std::string> &trainers_endpoints) {
-            self.trainers_endpoints_ = trainers_endpoints;
-          })
-      .def_property("trainer_id",
-                    [](const BuildStrategy &self) { return self.trainer_id_; },
-                    [](BuildStrategy &self, int trainer_id) {
-                      self.trainer_id_ = trainer_id;
-                    })
-      .def_property(
-          "nccl_comm_num",
-          [](const BuildStrategy &self) { return self.nccl_comm_num_; },
-          [](BuildStrategy &self, int nccl_comm_num) {
-            self.nccl_comm_num_ = nccl_comm_num;
-          })
-      .def_property("use_hierarchical_allreduce",
-                    [](const BuildStrategy &self) {
-                      return self.use_hierarchical_allreduce_;
-                    },
-                    [](BuildStrategy &self, bool use) {
-                      self.use_hierarchical_allreduce_ = use;
-                    })
-      .def_property("hierarchical_allreduce_inter_nranks",
-                    [](const BuildStrategy &self) {
-                      return self.hierarchical_allreduce_inter_nranks_;
-                    },
-                    [](BuildStrategy &self, int nranks) {
-                      self.hierarchical_allreduce_inter_nranks_ = nranks;
-                    })
-      .def_property(
-          "fuse_elewise_add_act_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_elewise_add_act_ops_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.fuse_elewise_add_act_ops_ = b;
-          },
-          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
-                to fuse elementwise_add_op and activation_op,
-                it may make the execution faster. Default False
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.fuse_elewise_add_act_ops = True
-                     )DOC")
-      .def_property(
-          "fuse_relu_depthwise_conv",
-          [](const BuildStrategy &self) {
-            return self.fuse_relu_depthwise_conv_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.fuse_relu_depthwise_conv_ = b;
-          },
-          R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether
-                to fuse relu and depthwise_conv2d,
-                it will save GPU memory and may make the execution faster.
-                This options is only available in GPU devices.
-                Default False.
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.fuse_relu_depthwise_conv = True
-          )DOC")
-      .def_property("fuse_broadcast_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_broadcast_ops_ == true ||
-                             self.fuse_broadcast_ops_ == boost::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                                        "BuildStrategy is finlaized.");
-                      self.fuse_broadcast_ops_ = b;
-                    },
-                    R"DOC(The type is BOOL, fuse_broadcast_op indicates whether
-                      to fuse the broadcast ops. Note that, in Reduce mode,
-                      fusing broadcast ops may make the program faster. Because
-                      fusing broadcast OP equals delaying the execution of all
-                      broadcast Ops, in this case, all nccl streams are used only
-                      for NCCLReduce operations for a period of time. Default False.)DOC")
-      .def_property("fuse_all_optimizer_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_all_optimizer_ops_ == true ||
-                             self.fuse_all_optimizer_ops_ == boost::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                                        "BuildStrategy is finlaized.");
-                      self.fuse_all_optimizer_ops_ = b;
-                    })
-      .def_property(
-          "sync_batch_norm",
-          [](const BuildStrategy &self) { return self.sync_batch_norm_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.sync_batch_norm_ = b;
-          },
-          R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
-                synchronous batch normalization which synchronizes the mean
-                and variance through multi-devices in training phase.
-                Current implementation doesn't support FP16 training and CPU.
-                And only synchronous on one machine, not all machines.
-                Default False
-                Examples:
-                    .. code-block:: python
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.sync_batch_norm = True
-                )DOC")
-      .def_property(
-          "memory_optimize",
-          [](const BuildStrategy &self) -> py::object {
-            if (self.memory_optimize_) {
-              return py::cast(self.memory_optimize_.get());
-            } else {
-              return py::cast(nullptr);
-            }
-          },
-          [](BuildStrategy &self, const py::handle &value) {
-            auto *py_obj = value.ptr();
-            if (py_obj == nullptr || py_obj == Py_None) {
-              self.memory_optimize_ = boost::none;
-            } else if (PyBool_Check(py_obj)) {
-              self.memory_optimize_ = (py_obj == Py_True);
-            } else {
-              PADDLE_THROW(
-                  "BuildStrategy.memory_optimize must be None, False or True");
-            }
-          },
-          R"DOC(The type is BOOL or None, memory opitimize aims to save total memory
-                consumption, set to True to enable it.
-                Default None. None means framework would choose to use or not use 
-                this strategy automatically. Currently, None means that it is 
-                enabled when GC is disabled, and disabled when GC is enabled. 
-                True means enabling and False means disabling. Default None.)DOC")
-      .def_property(
-          "is_distribution",
-          [](const BuildStrategy &self) { return self.is_distribution_; },
-          [](BuildStrategy &self, bool b) {
-#ifdef WIN32
-            if (b) {
-              PADDLE_THROW("Windows has NO support to distribute mode.");
-            }
-#else
-            self.is_distribution_ = b;
-#endif
-          })
-      .def_property("async_mode",
-                    [](const BuildStrategy &self) { return self.async_mode_; },
-                    [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
-      .def_property(
-          "enable_inplace",
-          [](const BuildStrategy &self) { return self.enable_inplace_; },
-          [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
-      .def_property(
-          "fuse_all_reduce_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_all_reduce_ops_ == true ||
-                   self.fuse_all_reduce_ops_ == boost::none;
-          },
-          [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
-      .def_property("enable_backward_optimizer_op_deps",
-                    [](const BuildStrategy &self) {
-                      return self.enable_backward_optimizer_op_deps_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.enable_backward_optimizer_op_deps_ = b;
-                    })
-      .def_property(
-          "cache_runtime_context",
-          [](const BuildStrategy &self) { return self.cache_runtime_context_; },
-          [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
-      .def_property(
-          "mkldnn_enabled_op_types",
-          [](const BuildStrategy &self) {
-            return self.mkldnn_enabled_op_types_;
-          },
-          [](BuildStrategy &self,
-             const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
-            self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
-          })
-      .def("_finalize_strategy_and_create_passes",
-           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-             return self.CreatePassesFromStrategy(true);
-           },
-           R"DOC(Allow user to customized passes. Normally model-specific
-                optimization passes should be defined in this way. BuildStrategy
-                cannot be updated after being finalized.)DOC");
-  pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::vector<std::string> &, const std::string &,
-                  Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
-                  const BuildStrategy &, ir::Graph *>())
-      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
-      // We still cannot get local_scope from this vector, since the element
-      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
-      // one by one and mark them as reference.
-      .def("local_scopes",
-           [](ParallelExecutor &self) -> std::vector<Scope *> * {
-             return &self.GetLocalScopes();
-           },
-           py::return_value_policy::reference)
-      .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
-      .def("_need_create_local_exe_scopes",
-           &ParallelExecutor::NeedCreateLocalExeScope)
-      .def("feed_tensors_into_local_scopes",
-           &ParallelExecutor::FeedTensorsIntoLocalScopes)
-      .def("feed_and_split_tensor_into_local_scopes",
-           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
-      .def("run", [](ParallelExecutor &self,
-                     const std::vector<std::string> &fetch_tensors) {
-        pybind11::gil_scoped_release release;
-        return self.Run(fetch_tensors);
-      });
-  BindFleetWrapper(&m);
-  BindBoxHelper(&m);
-#ifndef _WIN32
-  BindNCCLWrapper(&m);
-#endif
-  BindGraph(&m);
-  BindNode(&m);
-  BindInferenceApi(&m);
-  BindExpandApi(&m);
-  BindDataset(&m);
-#ifdef PADDLE_WITH_DISTRIBUTE
-  BindCommunicator(&m);
-#endif
-}
-}  // namespace pybind
-}  // namespace paddle